blob: e660834b4788fe95cf58872e9b7b261d7f1dbcb9 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinnerba3d67c2020-12-26 00:41:46 +010044#include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
Victor Stinner47e1afd2020-10-26 16:43:47 +010045#include "pycore_bytes_methods.h" // _Py_bytes_lower()
Serhiy Storchaka2ad93822020-12-03 12:46:16 +020046#include "pycore_format.h" // F_LJUST
Victor Stinner47e1afd2020-10-26 16:43:47 +010047#include "pycore_initconfig.h" // _PyStatus_OK()
48#include "pycore_interp.h" // PyInterpreterState.fs_codec
49#include "pycore_object.h" // _PyObject_GC_TRACK()
50#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
51#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
52#include "pycore_pystate.h" // _PyInterpreterState_GET()
53#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
54#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000056#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000057#include <windows.h>
58#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059
Jakub Kulík9032cf52021-04-30 15:21:42 +020060#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
61#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
62#endif
63
Victor Stinner666ecfb2020-07-02 01:19:57 +020064/* Uncomment to display statistics on interned strings at exit
65 in _PyUnicode_ClearInterned(). */
Victor Stinnerfecc4f22019-03-19 14:20:29 +010066/* #define INTERNED_STATS 1 */
67
68
Larry Hastings61272b72014-01-07 12:41:53 -080069/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090070class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080071[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090072/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
73
74/*[python input]
75class Py_UCS4_converter(CConverter):
76 type = 'Py_UCS4'
77 converter = 'convert_uc'
78
79 def converter_init(self):
80 if self.default is not unspecified:
81 self.c_default = ascii(self.default)
82 if len(self.c_default) > 4 or self.c_default[0] != "'":
83 self.c_default = hex(ord(self.default))
84
85[python start generated code]*/
86/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080087
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088/* --- Globals ------------------------------------------------------------
89
Serhiy Storchaka05997252013-01-26 12:14:02 +020090NOTE: In the interpreter's initialization phase, some globals are currently
91 initialized dynamically as needed. In the process Unicode objects may
92 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000093
94*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000095
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000096
97#ifdef __cplusplus
98extern "C" {
99#endif
100
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111 in decimal).
   fileutils.c must use the very same value. */
#define MAX_UNICODE 0x10ffff

/* Object check used by the assertions in this file: release builds only
   test the type, debug builds run _PyUnicode_CheckConsistency() with
   check_content set to 0. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200110
/* Raw access to the utf8 field; performs no checks. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for a compact ASCII object this is the memory
   right after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field; performs no checks. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: for a compact ASCII object this is the length
   field, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr field. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* The argument is parenthesized before being cast so that an expression
   argument (for example `base + 1`) is cast as a whole instead of having
   the cast bind to its first operand only. */
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw field accessors below; only _PyUnicode_KIND and
   _PyUnicode_GET_LENGTH assert that `op` passes _PyUnicode_CHECK. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200152
/* Replace the public PyUnicode_READY with a version that asserts the
   object passes _PyUnicode_CHECK: evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the utf8 pointer is the same address as the character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer is the same address as the character data.
   The comparison uses the macro parameter `op`; it previously named an
   unrelated identifier `unicode`, which only compiled when the caller
   happened to have a variable of that name in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
167
/* Non-zero when the object owns a separately allocated UTF-8 buffer:
   it is not compact ASCII, its utf8 pointer is set, and that pointer
   is not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object owns a separately allocated wstr buffer:
   its wstr pointer is set and either the object is not ready yet or
   wstr is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
181
/* Copy a run of characters while converting the element type.

   from_type / to_type: valid type names of the source and destination
                        elements.
   begin, end:          pointers delimiting the source run ("from_type *").
   to:                  destination buffer ("to_type *") that receives one
                        converted element per source element.

   Elements are copied four at a time for as long as possible; the
   remaining zero to three elements are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_dst = (to_type *)(to);                                \
        const from_type *_src = (const from_type *)(begin);             \
        const from_type *_stop = (const from_type *)(end);              \
        Py_ssize_t _count = _stop - _src;                               \
        const from_type *_quad_stop =                                   \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _src < _quad_stop; _src += 4, _dst += 4) {               \
            _dst[0] = (to_type) _src[0];                                \
            _dst[1] = (to_type) _src[1];                                \
            _dst[2] = (to_type) _src[2];                                \
            _dst[3] = (to_type) _src[3];                                \
        }                                                               \
        for (; _src < _stop; ++_src, ++_dst) {                          \
            *_dst = (to_type) *_src;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200205
/* Platform-specific overallocation factor. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
213
Walter Dörwald16807132007-05-25 13:52:07 +0000214
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200215static struct _Py_unicode_state*
216get_unicode_state(void)
217{
218 PyInterpreterState *interp = _PyInterpreterState_GET();
219 return &interp->unicode;
220}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200223// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static inline PyObject* unicode_get_empty(void)
225{
226 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200227 // unicode_get_empty() must not be called before _PyUnicode_Init()
228 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200229 assert(state->empty_string != NULL);
230 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200231}
232
Victor Stinner91698d82020-06-25 14:07:40 +0200233
234// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200235static inline PyObject* unicode_new_empty(void)
236{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200237 PyObject *empty = unicode_get_empty();
238 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200239 return empty;
240}
241
242#define _Py_RETURN_UNICODE_EMPTY() \
243 do { \
244 return unicode_new_empty(); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200245 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000246
Victor Stinner59423e32018-11-26 13:40:01 +0100247static inline void
248unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
249 Py_ssize_t start, Py_ssize_t length)
250{
251 assert(0 <= start);
252 assert(kind != PyUnicode_WCHAR_KIND);
253 switch (kind) {
254 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS1 ch = (unsigned char)value;
257 Py_UCS1 *to = (Py_UCS1 *)data + start;
258 memset(to, ch, length);
259 break;
260 }
261 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100262 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100263 Py_UCS2 ch = (Py_UCS2)value;
264 Py_UCS2 *to = (Py_UCS2 *)data + start;
265 const Py_UCS2 *end = to + length;
266 for (; to < end; ++to) *to = ch;
267 break;
268 }
269 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100270 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100271 Py_UCS4 ch = value;
272 Py_UCS4 * to = (Py_UCS4 *)data + start;
273 const Py_UCS4 *end = to + length;
274 for (; to < end; ++to) *to = ch;
275 break;
276 }
277 default: Py_UNREACHABLE();
278 }
279}
280
281
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200282/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700283static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200284_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900285static inline void
286_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400287static PyObject *
288unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
289 const char *errors);
290static PyObject *
291unicode_decode_utf8(const char *s, Py_ssize_t size,
292 _Py_error_handler error_handler, const char *errors,
293 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200294
/* Fast detection of the most frequent whitespace characters.

   One entry per ASCII code (0x00-0x7F), sixteen codes per row; an entry
   is 1 for a whitespace character and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
325
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200326/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200327static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200328static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100329static int unicode_modifiable(PyObject *unicode);
330
Victor Stinnerfe226c02011-10-03 03:52:20 +0200331
Alexander Belopolsky40018472011-02-26 01:02:56 +0000332static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100333_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200334static PyObject *
335_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
336static PyObject *
337_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
338
339static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000340unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000341 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100342 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000343 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
344
Alexander Belopolsky40018472011-02-26 01:02:56 +0000345static void
346raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300347 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100348 PyObject *unicode,
349 Py_ssize_t startpos, Py_ssize_t endpos,
350 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000351
/* Fast detection of line break characters.

   One entry per ASCII code (0x00-0x7F), sixteen codes per row; an entry
   is 1 for a line break character and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
379
INADA Naoki3ae20562017-01-16 20:41:20 +0900380static int convert_uc(PyObject *obj, void *addr);
381
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300382#include "clinic/unicodeobject.c.h"
383
Victor Stinner3d4226a2018-08-29 22:21:32 +0200384_Py_error_handler
385_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200386{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200387 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200388 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200391 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200394 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200397 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
399 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200400 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200403 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
Victor Stinner50149202015-09-22 00:26:54 +0200408 return _Py_ERROR_OTHER;
409}
410
Victor Stinner709d23d2019-05-02 14:56:30 -0400411
412static _Py_error_handler
413get_error_handler_wide(const wchar_t *errors)
414{
415 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
416 return _Py_ERROR_STRICT;
417 }
418 if (wcscmp(errors, L"surrogateescape") == 0) {
419 return _Py_ERROR_SURROGATEESCAPE;
420 }
421 if (wcscmp(errors, L"replace") == 0) {
422 return _Py_ERROR_REPLACE;
423 }
424 if (wcscmp(errors, L"ignore") == 0) {
425 return _Py_ERROR_IGNORE;
426 }
427 if (wcscmp(errors, L"backslashreplace") == 0) {
428 return _Py_ERROR_BACKSLASHREPLACE;
429 }
430 if (wcscmp(errors, L"surrogatepass") == 0) {
431 return _Py_ERROR_SURROGATEPASS;
432 }
433 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
434 return _Py_ERROR_XMLCHARREFREPLACE;
435 }
436 return _Py_ERROR_OTHER;
437}
438
439
Victor Stinner22eb6892019-06-26 00:51:05 +0200440static inline int
441unicode_check_encoding_errors(const char *encoding, const char *errors)
442{
443 if (encoding == NULL && errors == NULL) {
444 return 0;
445 }
446
Victor Stinner81a7be32020-04-14 15:14:01 +0200447 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200448#ifndef Py_DEBUG
449 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200450 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200451 return 0;
452 }
453#else
454 /* Always check in debug mode */
455#endif
456
457 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
458 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200459 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200460 return 0;
461 }
462
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200463 /* Disable checks during Python finalization. For example, it allows to
464 call _PyObject_Dump() during finalization for debugging purpose. */
465 if (interp->finalizing) {
466 return 0;
467 }
468
Victor Stinner22eb6892019-06-26 00:51:05 +0200469 if (encoding != NULL) {
470 PyObject *handler = _PyCodec_Lookup(encoding);
471 if (handler == NULL) {
472 return -1;
473 }
474 Py_DECREF(handler);
475 }
476
477 if (errors != NULL) {
478 PyObject *handler = PyCodec_LookupError(errors);
479 if (handler == NULL) {
480 return -1;
481 }
482 Py_DECREF(handler);
483 }
484 return 0;
485}
486
487
/* Verify the internal invariants of a Unicode object.

   op:            object to inspect; asserted to be non-NULL.
   check_content: when non-zero (and the kind is not PyUnicode_WCHAR_KIND),
                  also scan every character to verify that the narrowest
                  suitable kind is used and that the data is followed by a
                  null character. This scan is O(n).

   Every violated invariant is reported through
   _PyObject_ASSERT_FAILED_MSG(). Returns 1 when the end of the function
   is reached. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
/* Report the stringified expression against `op` when it is false. */
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    PyASCIIObject *ascii;
    unsigned int kind;

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* Compact ASCII: must be 1-byte kind and ready. */
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact, non-ASCII: the data pointer is taken as the memory
               right after the PyCompactUnicodeObject header. */
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                  || kind == PyUnicode_2BYTE_KIND
                  || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            /* Not compact: the data pointer is read from data.any. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* wchar_t kind: not ready, not interned, no hash, no data
                   and no utf8 buffer; only wstr is set. */
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            else {
                CHECK(kind == PyUnicode_1BYTE_KIND
                      || kind == PyUnicode_2BYTE_KIND
                      || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII: utf8 must point at the data itself. */
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must point at the data exactly when the kind has the
               same width as wchar_t. */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
                )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* An unset buffer must have a zero length. */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        const void *data;
        Py_UCS4 ch;

        /* Find the largest character stored in the string. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest character must fall in the range of the kind. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* The character at index `length` must be a null character. */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200607
Victor Stinner910337b2011-10-03 03:20:16 +0200608
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100609static PyObject*
610unicode_result_wchar(PyObject *unicode)
611{
612#ifndef Py_DEBUG
613 Py_ssize_t len;
614
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100615 len = _PyUnicode_WSTR_LENGTH(unicode);
616 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100617 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200618 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100619 }
620
621 if (len == 1) {
622 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100623 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100624 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200625 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100626 }
627 }
628
629 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200630 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 return NULL;
632 }
633#else
Victor Stinneraa771272012-10-04 02:32:58 +0200634 assert(Py_REFCNT(unicode) == 1);
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 /* don't make the result ready in debug mode to ensure that the caller
637 makes the string ready before using it */
638 assert(_PyUnicode_CheckConsistency(unicode, 1));
639#endif
640 return unicode;
641}
642
643static PyObject*
644unicode_result_ready(PyObject *unicode)
645{
646 Py_ssize_t length;
647
648 length = PyUnicode_GET_LENGTH(unicode);
649 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200650 PyObject *empty = unicode_get_empty();
651 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100652 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200653 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100654 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200655 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100656 }
657
658 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200659 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200660 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakac43317d2021-06-12 20:44:32 +0300661 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200662 Py_UCS1 ch = data[0];
663 struct _Py_unicode_state *state = get_unicode_state();
664 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100665 if (latin1_char != NULL) {
666 if (unicode != latin1_char) {
667 Py_INCREF(latin1_char);
668 Py_DECREF(unicode);
669 }
670 return latin1_char;
671 }
672 else {
673 assert(_PyUnicode_CheckConsistency(unicode, 1));
674 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200675 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100676 return unicode;
677 }
678 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200679 else {
680 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
681 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100682 }
683
684 assert(_PyUnicode_CheckConsistency(unicode, 1));
685 return unicode;
686}
687
688static PyObject*
689unicode_result(PyObject *unicode)
690{
691 assert(_PyUnicode_CHECK(unicode));
692 if (PyUnicode_IS_READY(unicode))
693 return unicode_result_ready(unicode);
694 else
695 return unicode_result_wchar(unicode);
696}
697
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698static PyObject*
699unicode_result_unchanged(PyObject *unicode)
700{
701 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500702 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100703 return NULL;
704 Py_INCREF(unicode);
705 return unicode;
706 }
707 else
708 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100709 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710}
711
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200712/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
713 ASCII, Latin1, UTF-8, etc. */
714static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200715backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200716 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
717{
Victor Stinnerad771582015-10-09 12:38:53 +0200718 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200719 Py_UCS4 ch;
720 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300721 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200722
723 assert(PyUnicode_IS_READY(unicode));
724 kind = PyUnicode_KIND(unicode);
725 data = PyUnicode_DATA(unicode);
726
727 size = 0;
728 /* determine replacement size */
729 for (i = collstart; i < collend; ++i) {
730 Py_ssize_t incr;
731
732 ch = PyUnicode_READ(kind, data, i);
733 if (ch < 0x100)
734 incr = 2+2;
735 else if (ch < 0x10000)
736 incr = 2+4;
737 else {
738 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200739 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200740 }
741 if (size > PY_SSIZE_T_MAX - incr) {
742 PyErr_SetString(PyExc_OverflowError,
743 "encoded result is too long for a Python string");
744 return NULL;
745 }
746 size += incr;
747 }
748
Victor Stinnerad771582015-10-09 12:38:53 +0200749 str = _PyBytesWriter_Prepare(writer, str, size);
750 if (str == NULL)
751 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752
753 /* generate replacement */
754 for (i = collstart; i < collend; ++i) {
755 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200756 *str++ = '\\';
757 if (ch >= 0x00010000) {
758 *str++ = 'U';
759 *str++ = Py_hexdigits[(ch>>28)&0xf];
760 *str++ = Py_hexdigits[(ch>>24)&0xf];
761 *str++ = Py_hexdigits[(ch>>20)&0xf];
762 *str++ = Py_hexdigits[(ch>>16)&0xf];
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200765 }
Victor Stinner797485e2015-10-09 03:17:30 +0200766 else if (ch >= 0x100) {
767 *str++ = 'u';
768 *str++ = Py_hexdigits[(ch>>12)&0xf];
769 *str++ = Py_hexdigits[(ch>>8)&0xf];
770 }
771 else
772 *str++ = 'x';
773 *str++ = Py_hexdigits[(ch>>4)&0xf];
774 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200775 }
776 return str;
777}
778
779/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
780 ASCII, Latin1, UTF-8, etc. */
781static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200782xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200783 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
784{
Victor Stinnerad771582015-10-09 12:38:53 +0200785 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200786 Py_UCS4 ch;
787 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300788 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200789
790 assert(PyUnicode_IS_READY(unicode));
791 kind = PyUnicode_KIND(unicode);
792 data = PyUnicode_DATA(unicode);
793
794 size = 0;
795 /* determine replacement size */
796 for (i = collstart; i < collend; ++i) {
797 Py_ssize_t incr;
798
799 ch = PyUnicode_READ(kind, data, i);
800 if (ch < 10)
801 incr = 2+1+1;
802 else if (ch < 100)
803 incr = 2+2+1;
804 else if (ch < 1000)
805 incr = 2+3+1;
806 else if (ch < 10000)
807 incr = 2+4+1;
808 else if (ch < 100000)
809 incr = 2+5+1;
810 else if (ch < 1000000)
811 incr = 2+6+1;
812 else {
813 assert(ch <= MAX_UNICODE);
814 incr = 2+7+1;
815 }
816 if (size > PY_SSIZE_T_MAX - incr) {
817 PyErr_SetString(PyExc_OverflowError,
818 "encoded result is too long for a Python string");
819 return NULL;
820 }
821 size += incr;
822 }
823
Victor Stinnerad771582015-10-09 12:38:53 +0200824 str = _PyBytesWriter_Prepare(writer, str, size);
825 if (str == NULL)
826 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200827
828 /* generate replacement */
829 for (i = collstart; i < collend; ++i) {
Christian Heimes07f2ade2020-11-18 16:38:53 +0100830 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
831 if (size < 0) {
832 return NULL;
833 }
834 str += size;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200835 }
836 return str;
837}
838
Thomas Wouters477c8d52006-05-27 19:21:47 +0000839/* --- Bloom Filters ----------------------------------------------------- */
840
841/* stuff to implement simple "bloom filters" for Unicode characters.
842 to keep things simple, we use a single bitmask, using the least 5
843 bits from each unicode characters as the bit index. */
844
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200845/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000846
Antoine Pitrouf068f942010-01-13 14:19:12 +0000847#if LONG_BIT >= 128
848#define BLOOM_WIDTH 128
849#elif LONG_BIT >= 64
850#define BLOOM_WIDTH 64
851#elif LONG_BIT >= 32
852#define BLOOM_WIDTH 32
853#else
854#error "LONG_BIT is smaller than 32"
855#endif
856
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857#define BLOOM_MASK unsigned long
858
Serhiy Storchaka05997252013-01-26 12:14:02 +0200859static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860
Antoine Pitrouf068f942010-01-13 14:19:12 +0000861#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000862
Benjamin Peterson29060642009-01-31 22:14:21 +0000863#define BLOOM_LINEBREAK(ch) \
864 ((ch) < 128U ? ascii_linebreak[(ch)] : \
865 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000866
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700867static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300868make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000869{
Victor Stinnera85af502013-04-09 21:53:54 +0200870#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
871 do { \
872 TYPE *data = (TYPE *)PTR; \
873 TYPE *end = data + LEN; \
874 Py_UCS4 ch; \
875 for (; data != end; data++) { \
876 ch = *data; \
877 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
878 } \
879 break; \
880 } while (0)
881
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882 /* calculate simple bloom-style bitmask for a given unicode string */
883
Antoine Pitrouf068f942010-01-13 14:19:12 +0000884 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000885
886 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200887 switch (kind) {
888 case PyUnicode_1BYTE_KIND:
889 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
890 break;
891 case PyUnicode_2BYTE_KIND:
892 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
893 break;
894 case PyUnicode_4BYTE_KIND:
895 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
896 break;
897 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700898 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200899 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000900 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200901
902#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000903}
904
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300905static int
906ensure_unicode(PyObject *obj)
907{
908 if (!PyUnicode_Check(obj)) {
909 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200910 "must be str, not %.100s",
911 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300912 return -1;
913 }
914 return PyUnicode_READY(obj);
915}
916
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917/* Compilation of templated routines */
918
Victor Stinner90ed8a62020-06-24 00:34:07 +0200919#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200920
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200921#include "stringlib/asciilib.h"
922#include "stringlib/fastsearch.h"
923#include "stringlib/partition.h"
924#include "stringlib/split.h"
925#include "stringlib/count.h"
926#include "stringlib/find.h"
927#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200928#include "stringlib/undef.h"
929
930#include "stringlib/ucs1lib.h"
931#include "stringlib/fastsearch.h"
932#include "stringlib/partition.h"
933#include "stringlib/split.h"
934#include "stringlib/count.h"
935#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300936#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200937#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200938#include "stringlib/undef.h"
939
940#include "stringlib/ucs2lib.h"
941#include "stringlib/fastsearch.h"
942#include "stringlib/partition.h"
943#include "stringlib/split.h"
944#include "stringlib/count.h"
945#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300946#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200947#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200948#include "stringlib/undef.h"
949
950#include "stringlib/ucs4lib.h"
951#include "stringlib/fastsearch.h"
952#include "stringlib/partition.h"
953#include "stringlib/split.h"
954#include "stringlib/count.h"
955#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300956#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200957#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200958#include "stringlib/undef.h"
959
Inada Naoki2c4928d2020-06-17 20:09:44 +0900960_Py_COMP_DIAG_PUSH
961_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962#include "stringlib/unicodedefs.h"
963#include "stringlib/fastsearch.h"
964#include "stringlib/count.h"
965#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100966#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900967_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200968
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200969#undef STRINGLIB_GET_EMPTY
970
Guido van Rossumd57fd912000-03-10 22:53:23 +0000971/* --- Unicode Object ----------------------------------------------------- */
972
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700973static inline Py_ssize_t
974findchar(const void *s, int kind,
975 Py_ssize_t size, Py_UCS4 ch,
976 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200978 switch (kind) {
979 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200980 if ((Py_UCS1) ch != ch)
981 return -1;
982 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600983 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200984 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600985 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200987 if ((Py_UCS2) ch != ch)
988 return -1;
989 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600990 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200991 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600992 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200993 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200994 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600995 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200996 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600997 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200998 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700999 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001}
1002
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length)
   with 0xff bytes so that bugs are detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Does nothing when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* The kind value is used as the number of bytes per character. */
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1021
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022static PyObject*
1023resize_compact(PyObject *unicode, Py_ssize_t length)
1024{
1025 Py_ssize_t char_size;
1026 Py_ssize_t struct_size;
1027 Py_ssize_t new_size;
1028 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001029 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001030#ifdef Py_DEBUG
1031 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1032#endif
1033
Victor Stinner79891572012-05-03 13:43:07 +02001034 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001036 assert(PyUnicode_IS_COMPACT(unicode));
1037
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001038 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001039 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 struct_size = sizeof(PyASCIIObject);
1041 else
1042 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001043 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001044
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1046 PyErr_NoMemory();
1047 return NULL;
1048 }
1049 new_size = (struct_size + (length + 1) * char_size);
1050
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001051 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001052 PyObject_Free(_PyUnicode_UTF8(unicode));
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001053 _PyUnicode_UTF8(unicode) = NULL;
1054 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1055 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001056#ifdef Py_REF_DEBUG
1057 _Py_RefTotal--;
1058#endif
1059#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001060 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001061#endif
Victor Stinner84def372011-12-11 20:04:56 +01001062
Victor Stinner32bd68c2020-12-01 10:37:39 +01001063 new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001064 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001065 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001066 PyErr_NoMemory();
1067 return NULL;
1068 }
Victor Stinner84def372011-12-11 20:04:56 +01001069 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001071
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001073 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001075 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001076 _PyUnicode_WSTR_LENGTH(unicode) = length;
1077 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001078 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001079 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001080 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001081 if (!PyUnicode_IS_ASCII(unicode))
1082 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001083 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001084#ifdef Py_DEBUG
1085 unicode_fill_invalid(unicode, old_length);
1086#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001087 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1088 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001089 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001090 return unicode;
1091}
1092
Alexander Belopolsky40018472011-02-26 01:02:56 +00001093static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001094resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095{
Victor Stinner95663112011-10-04 01:03:50 +02001096 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001097 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001099 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001100
Victor Stinnerfe226c02011-10-03 03:52:20 +02001101 if (PyUnicode_IS_READY(unicode)) {
1102 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001103 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001104 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001105#ifdef Py_DEBUG
1106 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1107#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001108
1109 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001110 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001111 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1112 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001113
1114 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1115 PyErr_NoMemory();
1116 return -1;
1117 }
1118 new_size = (length + 1) * char_size;
1119
Victor Stinner7a9105a2011-12-12 00:13:42 +01001120 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1121 {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001122 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinner7a9105a2011-12-12 00:13:42 +01001123 _PyUnicode_UTF8(unicode) = NULL;
1124 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1125 }
1126
Victor Stinner32bd68c2020-12-01 10:37:39 +01001127 data = (PyObject *)PyObject_Realloc(data, new_size);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001128 if (data == NULL) {
1129 PyErr_NoMemory();
1130 return -1;
1131 }
1132 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001133 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001134 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001135 _PyUnicode_WSTR_LENGTH(unicode) = length;
1136 }
1137 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001138 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001139 _PyUnicode_UTF8_LENGTH(unicode) = length;
1140 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001141 _PyUnicode_LENGTH(unicode) = length;
1142 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001143#ifdef Py_DEBUG
1144 unicode_fill_invalid(unicode, old_length);
1145#endif
Victor Stinner95663112011-10-04 01:03:50 +02001146 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001147 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001148 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001150 }
Victor Stinner95663112011-10-04 01:03:50 +02001151 assert(_PyUnicode_WSTR(unicode) != NULL);
1152
1153 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001154 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001155 PyErr_NoMemory();
1156 return -1;
1157 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001158 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001159 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001160 wstr = PyObject_Realloc(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001161 if (!wstr) {
1162 PyErr_NoMemory();
1163 return -1;
1164 }
1165 _PyUnicode_WSTR(unicode) = wstr;
1166 _PyUnicode_WSTR(unicode)[length] = 0;
1167 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001168 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 return 0;
1170}
1171
Victor Stinnerfe226c02011-10-03 03:52:20 +02001172static PyObject*
1173resize_copy(PyObject *unicode, Py_ssize_t length)
1174{
1175 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001176 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001177 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001178
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001179 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180
1181 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1182 if (copy == NULL)
1183 return NULL;
1184
1185 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001186 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001187 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001188 }
1189 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001190 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001191
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001192 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001193 if (w == NULL)
1194 return NULL;
1195 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1196 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001197 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001198 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001199 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001200 }
1201}
1202
/* We allocate one extra character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206
1207 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001208 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210*/
1211
Alexander Belopolsky40018472011-02-26 01:02:56 +00001212static PyUnicodeObject *
1213_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001215 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
Thomas Wouters477c8d52006-05-27 19:21:47 +00001218 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001219 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001220 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 }
1222
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001223 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001224 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001225 return (PyUnicodeObject *)PyErr_NoMemory();
1226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 if (length < 0) {
1228 PyErr_SetString(PyExc_SystemError,
1229 "Negative size passed to _PyUnicode_New");
1230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 }
1232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1234 if (unicode == NULL)
1235 return NULL;
1236 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001237
1238 _PyUnicode_WSTR_LENGTH(unicode) = length;
1239 _PyUnicode_HASH(unicode) = -1;
1240 _PyUnicode_STATE(unicode).interned = 0;
1241 _PyUnicode_STATE(unicode).kind = 0;
1242 _PyUnicode_STATE(unicode).compact = 0;
1243 _PyUnicode_STATE(unicode).ready = 0;
1244 _PyUnicode_STATE(unicode).ascii = 0;
1245 _PyUnicode_DATA_ANY(unicode) = NULL;
1246 _PyUnicode_LENGTH(unicode) = 0;
1247 _PyUnicode_UTF8(unicode) = NULL;
1248 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1249
Victor Stinner32bd68c2020-12-01 10:37:39 +01001250 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001252 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001253 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001254 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256
Jeremy Hyltond8082792003-09-16 19:41:39 +00001257 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001258 * the caller fails before initializing str -- unicode_resize()
1259 * reads str[0], and the Keep-Alive optimization can keep memory
1260 * allocated for str alive across a call to unicode_dealloc(unicode).
1261 * We don't want unicode_resize to read uninitialized memory in
1262 * that case.
1263 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264 _PyUnicode_WSTR(unicode)[0] = 0;
1265 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001266
Victor Stinner7931d9a2011-11-04 00:22:48 +01001267 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 return unicode;
1269}
1270
Victor Stinnerf42dc442011-10-02 23:33:16 +02001271static const char*
1272unicode_kind_name(PyObject *unicode)
1273{
Victor Stinner42dfd712011-10-03 14:41:45 +02001274 /* don't check consistency: unicode_kind_name() is called from
1275 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 if (!PyUnicode_IS_COMPACT(unicode))
1277 {
1278 if (!PyUnicode_IS_READY(unicode))
1279 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001280 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001281 {
1282 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001283 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 return "legacy ascii";
1285 else
1286 return "legacy latin1";
1287 case PyUnicode_2BYTE_KIND:
1288 return "legacy UCS2";
1289 case PyUnicode_4BYTE_KIND:
1290 return "legacy UCS4";
1291 default:
1292 return "<legacy invalid kind>";
1293 }
1294 }
1295 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001296 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001297 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001298 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001299 return "ascii";
1300 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001301 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001302 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001303 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001304 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001305 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001306 default:
1307 return "<invalid compact kind>";
1308 }
1309}
1310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001313const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001314 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001315 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316}
1317
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001318const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001319 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 return _PyUnicode_COMPACT_DATA(unicode);
1321}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001322const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001323 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001324 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1326 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1327 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1328 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1329 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1330 return PyUnicode_DATA(unicode);
1331}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001332
1333void
1334_PyUnicode_Dump(PyObject *op)
1335{
1336 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001337 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1338 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001339 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001340
Victor Stinnera849a4b2011-10-03 12:12:11 +02001341 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001342 {
1343 if (ascii->state.ascii)
1344 data = (ascii + 1);
1345 else
1346 data = (compact + 1);
1347 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001348 else
1349 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001350 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001351
Victor Stinnera849a4b2011-10-03 12:12:11 +02001352 if (ascii->wstr == data)
1353 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001354 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001355
Victor Stinnera3b334d2011-10-03 13:53:37 +02001356 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001357 printf(" (%zu), ", compact->wstr_length);
1358 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001359 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001360 }
1361 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001362 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001363 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001364}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365#endif
1366
Victor Stinner91698d82020-06-25 14:07:40 +02001367static int
1368unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1369{
1370 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1371 // optimized to always use state->empty_string without having to check if
1372 // it is NULL or not.
1373 PyObject *empty = PyUnicode_New(1, 0);
1374 if (empty == NULL) {
1375 return -1;
1376 }
1377 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1378 _PyUnicode_LENGTH(empty) = 0;
1379 assert(_PyUnicode_CheckConsistency(empty, 1));
1380
1381 assert(state->empty_string == NULL);
1382 state->empty_string = empty;
1383 return 0;
1384}
1385
1386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387PyObject *
1388PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1389{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001390 /* Optimization for empty strings */
1391 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001392 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001393 }
1394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 PyObject *obj;
1396 PyCompactUnicodeObject *unicode;
1397 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001398 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001399 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 Py_ssize_t char_size;
1401 Py_ssize_t struct_size;
1402
Victor Stinner9e9d6892011-10-04 01:02:02 +02001403 is_ascii = 0;
1404 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 struct_size = sizeof(PyCompactUnicodeObject);
1406 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001407 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 char_size = 1;
1409 is_ascii = 1;
1410 struct_size = sizeof(PyASCIIObject);
1411 }
1412 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001413 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 char_size = 1;
1415 }
1416 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001417 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 char_size = 2;
1419 if (sizeof(wchar_t) == 2)
1420 is_sharing = 1;
1421 }
1422 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001423 if (maxchar > MAX_UNICODE) {
1424 PyErr_SetString(PyExc_SystemError,
1425 "invalid maximum character passed to PyUnicode_New");
1426 return NULL;
1427 }
Victor Stinner8f825062012-04-27 13:55:39 +02001428 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 char_size = 4;
1430 if (sizeof(wchar_t) == 4)
1431 is_sharing = 1;
1432 }
1433
1434 /* Ensure we won't overflow the size. */
1435 if (size < 0) {
1436 PyErr_SetString(PyExc_SystemError,
1437 "Negative size passed to PyUnicode_New");
1438 return NULL;
1439 }
1440 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1441 return PyErr_NoMemory();
1442
1443 /* Duplicated allocation code from _PyObject_New() instead of a call to
1444 * PyObject_New() so we are able to allocate space for the object and
1445 * it's data buffer.
1446 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001447 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001448 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001450 }
1451 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452
1453 unicode = (PyCompactUnicodeObject *)obj;
1454 if (is_ascii)
1455 data = ((PyASCIIObject*)obj) + 1;
1456 else
1457 data = unicode + 1;
1458 _PyUnicode_LENGTH(unicode) = size;
1459 _PyUnicode_HASH(unicode) = -1;
1460 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001461 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 _PyUnicode_STATE(unicode).compact = 1;
1463 _PyUnicode_STATE(unicode).ready = 1;
1464 _PyUnicode_STATE(unicode).ascii = is_ascii;
1465 if (is_ascii) {
1466 ((char*)data)[size] = 0;
1467 _PyUnicode_WSTR(unicode) = NULL;
1468 }
Victor Stinner8f825062012-04-27 13:55:39 +02001469 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 ((char*)data)[size] = 0;
1471 _PyUnicode_WSTR(unicode) = NULL;
1472 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001474 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 else {
1477 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001478 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001479 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001481 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 ((Py_UCS4*)data)[size] = 0;
1483 if (is_sharing) {
1484 _PyUnicode_WSTR_LENGTH(unicode) = size;
1485 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1486 }
1487 else {
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 _PyUnicode_WSTR(unicode) = NULL;
1490 }
1491 }
Victor Stinner8f825062012-04-27 13:55:39 +02001492#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001493 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001494#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001495 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 return obj;
1497}
1498
#if SIZEOF_WCHAR_T == 2
/* Convert a 16-bit wchar_t buffer [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`. A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit (including a lone surrogate) is copied as is. The remaining
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    Py_UCS4 *dst = PyUnicode_4BYTE_DATA(unicode);
    const wchar_t *src = begin;

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* The output must be filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1538
Victor Stinnercd9950f2011-10-02 00:34:53 +02001539static int
Victor Stinner488fa492011-12-12 00:01:39 +01001540unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001541{
Victor Stinner488fa492011-12-12 00:01:39 +01001542 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001543 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001544 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001545 return -1;
1546 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547 return 0;
1548}
1549
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550static int
1551_copy_characters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001555 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001556 const void *from_data;
1557 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558
Victor Stinneree4544c2012-05-09 22:24:08 +02001559 assert(0 <= how_many);
1560 assert(0 <= from_start);
1561 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001562 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001564 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565
Victor Stinnerd3f08822012-05-29 12:57:52 +02001566 assert(PyUnicode_Check(to));
1567 assert(PyUnicode_IS_READY(to));
1568 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1569
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001570 if (how_many == 0)
1571 return 0;
1572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001574 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001576 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577
Victor Stinnerf1852262012-06-16 16:38:26 +02001578#ifdef Py_DEBUG
1579 if (!check_maxchar
1580 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1581 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001582 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001583 Py_UCS4 ch;
1584 Py_ssize_t i;
1585 for (i=0; i < how_many; i++) {
1586 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1587 assert(ch <= to_maxchar);
1588 }
1589 }
1590#endif
1591
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001592 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001593 if (check_maxchar
1594 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1595 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001596 /* Writing Latin-1 characters into an ASCII string requires to
1597 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001598 Py_UCS4 max_char;
1599 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001600 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (max_char >= 128)
1602 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001603 }
Christian Heimesf051e432016-09-13 20:22:02 +02001604 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001605 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001606 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001608 else if (from_kind == PyUnicode_1BYTE_KIND
1609 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001610 {
1611 _PyUnicode_CONVERT_BYTES(
1612 Py_UCS1, Py_UCS2,
1613 PyUnicode_1BYTE_DATA(from) + from_start,
1614 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1615 PyUnicode_2BYTE_DATA(to) + to_start
1616 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001617 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001618 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001619 && to_kind == PyUnicode_4BYTE_KIND)
1620 {
1621 _PyUnicode_CONVERT_BYTES(
1622 Py_UCS1, Py_UCS4,
1623 PyUnicode_1BYTE_DATA(from) + from_start,
1624 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1625 PyUnicode_4BYTE_DATA(to) + to_start
1626 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 }
1628 else if (from_kind == PyUnicode_2BYTE_KIND
1629 && to_kind == PyUnicode_4BYTE_KIND)
1630 {
1631 _PyUnicode_CONVERT_BYTES(
1632 Py_UCS2, Py_UCS4,
1633 PyUnicode_2BYTE_DATA(from) + from_start,
1634 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1635 PyUnicode_4BYTE_DATA(to) + to_start
1636 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001637 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001638 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001639 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1640
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001641 if (!check_maxchar) {
1642 if (from_kind == PyUnicode_2BYTE_KIND
1643 && to_kind == PyUnicode_1BYTE_KIND)
1644 {
1645 _PyUnicode_CONVERT_BYTES(
1646 Py_UCS2, Py_UCS1,
1647 PyUnicode_2BYTE_DATA(from) + from_start,
1648 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1649 PyUnicode_1BYTE_DATA(to) + to_start
1650 );
1651 }
1652 else if (from_kind == PyUnicode_4BYTE_KIND
1653 && to_kind == PyUnicode_1BYTE_KIND)
1654 {
1655 _PyUnicode_CONVERT_BYTES(
1656 Py_UCS4, Py_UCS1,
1657 PyUnicode_4BYTE_DATA(from) + from_start,
1658 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1659 PyUnicode_1BYTE_DATA(to) + to_start
1660 );
1661 }
1662 else if (from_kind == PyUnicode_4BYTE_KIND
1663 && to_kind == PyUnicode_2BYTE_KIND)
1664 {
1665 _PyUnicode_CONVERT_BYTES(
1666 Py_UCS4, Py_UCS2,
1667 PyUnicode_4BYTE_DATA(from) + from_start,
1668 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1669 PyUnicode_2BYTE_DATA(to) + to_start
1670 );
1671 }
1672 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001673 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001674 }
1675 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001676 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001677 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001678 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001679 Py_ssize_t i;
1680
Victor Stinnera0702ab2011-09-29 14:14:38 +02001681 for (i=0; i < how_many; i++) {
1682 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001683 if (ch > to_maxchar)
1684 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1686 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 }
1688 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001689 return 0;
1690}
1691
Victor Stinnerd3f08822012-05-29 12:57:52 +02001692void
1693_PyUnicode_FastCopyCharacters(
1694 PyObject *to, Py_ssize_t to_start,
1695 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001696{
1697 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1698}
1699
1700Py_ssize_t
1701PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1702 PyObject *from, Py_ssize_t from_start,
1703 Py_ssize_t how_many)
1704{
1705 int err;
1706
1707 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1708 PyErr_BadInternalCall();
1709 return -1;
1710 }
1711
Benjamin Petersonbac79492012-01-14 13:34:47 -05001712 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001713 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001714 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001715 return -1;
1716
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001717 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001718 PyErr_SetString(PyExc_IndexError, "string index out of range");
1719 return -1;
1720 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001721 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001722 PyErr_SetString(PyExc_IndexError, "string index out of range");
1723 return -1;
1724 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if (how_many < 0) {
1726 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1727 return -1;
1728 }
1729 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001730 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1731 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001732 "Cannot write %zi characters at %zi "
1733 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001734 how_many, to_start, PyUnicode_GET_LENGTH(to));
1735 return -1;
1736 }
1737
1738 if (how_many == 0)
1739 return 0;
1740
Victor Stinner488fa492011-12-12 00:01:39 +01001741 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 return -1;
1743
1744 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1745 if (err) {
1746 PyErr_Format(PyExc_SystemError,
1747 "Cannot copy %s characters "
1748 "into a string of %s characters",
1749 unicode_kind_name(from),
1750 unicode_kind_name(to));
1751 return -1;
1752 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001753 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754}
1755
Victor Stinner17222162011-09-28 22:15:37 +02001756/* Find the maximum code point and count the number of surrogate pairs so a
1757 correct string length can be computed before converting a string to UCS4.
1758 This function counts single surrogates as a character and not as a pair.
1759
1760 Return 0 on success, or -1 on error. */
1761static int
1762find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1763 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764{
1765 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001766 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767
Victor Stinnerc53be962011-10-02 21:33:54 +02001768 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 *num_surrogates = 0;
1770 *maxchar = 0;
1771
1772 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001774 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1775 && (iter+1) < end
1776 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1777 {
1778 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1779 ++(*num_surrogates);
1780 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 }
1782 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001784 {
1785 ch = *iter;
1786 iter++;
1787 }
1788 if (ch > *maxchar) {
1789 *maxchar = ch;
1790 if (*maxchar > MAX_UNICODE) {
1791 PyErr_Format(PyExc_ValueError,
Victor Stinner99768342021-03-17 21:46:53 +01001792 "character U+%x is not in range [U+0000; U+%x]",
1793 ch, MAX_UNICODE);
Victor Stinner8faf8212011-12-08 22:14:11 +01001794 return -1;
1795 }
1796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 }
1798 return 0;
1799}
1800
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001801int
1802_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803{
1804 wchar_t *end;
1805 Py_UCS4 maxchar = 0;
1806 Py_ssize_t num_surrogates;
1807#if SIZEOF_WCHAR_T == 2
1808 Py_ssize_t length_wo_surrogates;
1809#endif
1810
Georg Brandl7597add2011-10-05 16:36:47 +02001811 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001812 strings were created using _PyObject_New() and where no canonical
1813 representation (the str field) has been set yet aka strings
1814 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001815 assert(_PyUnicode_CHECK(unicode));
1816 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001818 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001819 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 /* Actually, it should neither be interned nor be anything else: */
1821 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001824 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001825 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827
1828 if (maxchar < 256) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001829 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001830 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 PyErr_NoMemory();
1832 return -1;
1833 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001834 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 _PyUnicode_WSTR(unicode), end,
1836 PyUnicode_1BYTE_DATA(unicode));
1837 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1838 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1839 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1840 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001841 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001842 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001843 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 }
1845 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001846 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001847 _PyUnicode_UTF8(unicode) = NULL;
1848 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001850 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 _PyUnicode_WSTR(unicode) = NULL;
1852 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1853 }
1854 /* In this case we might have to convert down from 4-byte native
1855 wchar_t to 2-byte unicode. */
1856 else if (maxchar < 65536) {
1857 assert(num_surrogates == 0 &&
1858 "FindMaxCharAndNumSurrogatePairs() messed up");
1859
Victor Stinner506f5922011-09-28 22:34:18 +02001860#if SIZEOF_WCHAR_T == 2
1861 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001862 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001863 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1864 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1865 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001866 _PyUnicode_UTF8(unicode) = NULL;
1867 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001868#else
1869 /* sizeof(wchar_t) == 4 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001870 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
Victor Stinner506f5922011-09-28 22:34:18 +02001871 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001872 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001873 PyErr_NoMemory();
1874 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 }
Victor Stinner506f5922011-09-28 22:34:18 +02001876 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1877 _PyUnicode_WSTR(unicode), end,
1878 PyUnicode_2BYTE_DATA(unicode));
1879 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1880 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1881 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001882 _PyUnicode_UTF8(unicode) = NULL;
1883 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner32bd68c2020-12-01 10:37:39 +01001884 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinner506f5922011-09-28 22:34:18 +02001885 _PyUnicode_WSTR(unicode) = NULL;
1886 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1887#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 }
Ikko Ashimine38811d62020-11-10 14:57:34 +09001889 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 else {
1891#if SIZEOF_WCHAR_T == 2
1892 /* in case the native representation is 2-bytes, we need to allocate a
1893 new normalized 4-byte version. */
1894 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001895 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1896 PyErr_NoMemory();
1897 return -1;
1898 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001899 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001900 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 PyErr_NoMemory();
1902 return -1;
1903 }
1904 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1905 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001906 _PyUnicode_UTF8(unicode) = NULL;
1907 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001908 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1909 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001910 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001911 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 _PyUnicode_WSTR(unicode) = NULL;
1913 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1914#else
1915 assert(num_surrogates == 0);
1916
Victor Stinnerc3c74152011-10-02 20:39:55 +02001917 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001919 _PyUnicode_UTF8(unicode) = NULL;
1920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1922#endif
1923 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1924 }
1925 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001926 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 return 0;
1928}
1929
Alexander Belopolsky40018472011-02-26 01:02:56 +00001930static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001931unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932{
Walter Dörwald16807132007-05-25 13:52:07 +00001933 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 case SSTATE_NOT_INTERNED:
1935 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001936
Benjamin Peterson29060642009-01-31 22:14:21 +00001937 case SSTATE_INTERNED_MORTAL:
Victor Stinnerea251802020-12-26 02:58:33 +01001938 {
1939 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner3549ca32020-07-03 16:59:12 +02001940 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1941 references (key and value) which were ignored by
1942 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1943 to prevent calling unicode_dealloc() again. Adjust refcnt after
1944 PyDict_DelItem(). */
1945 assert(Py_REFCNT(unicode) == 0);
1946 Py_SET_REFCNT(unicode, 3);
Victor Stinnerea251802020-12-26 02:58:33 +01001947 if (PyDict_DelItem(state->interned, unicode) != 0) {
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001948 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1949 NULL);
1950 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001951 assert(Py_REFCNT(unicode) == 1);
1952 Py_SET_REFCNT(unicode, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001953 break;
Victor Stinnerea251802020-12-26 02:58:33 +01001954 }
Walter Dörwald16807132007-05-25 13:52:07 +00001955
Benjamin Peterson29060642009-01-31 22:14:21 +00001956 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001957 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1958 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001959
Benjamin Peterson29060642009-01-31 22:14:21 +00001960 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001961 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001962 }
1963
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001965 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001966 }
1967 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001968 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001969 }
1970 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001971 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001974 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975}
1976
#ifdef Py_DEBUG
/* Debug-only helper.

   Return 1 if `unicode` is one of the shared objects held in the unicode
   state: either the cached empty string, or the entry of the latin1[]
   table that corresponds to its single character.  Return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *cache = get_unicode_state();
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;

    if (cache->empty_string == unicode) {
        return 1;
    }

    /* Only a one-character string whose kind is not PyUnicode_WCHAR_KIND
       is looked up in the latin1[] table. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }

    Py_UCS4 first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && cache->latin1[first] == unicode) ? 1 : 0;
}
#endif
1996
Alexander Belopolsky40018472011-02-26 01:02:56 +00001997static int
Victor Stinner488fa492011-12-12 00:01:39 +01001998unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999{
Victor Stinner488fa492011-12-12 00:01:39 +01002000 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002001 if (Py_REFCNT(unicode) != 1)
2002 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002003 if (_PyUnicode_HASH(unicode) != -1)
2004 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002005 if (PyUnicode_CHECK_INTERNED(unicode))
2006 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002007 if (!PyUnicode_CheckExact(unicode))
2008 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002009#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002010 /* singleton refcount is greater than 1 */
2011 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002012#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002013 return 1;
2014}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002015
Victor Stinnerfe226c02011-10-03 03:52:20 +02002016static int
2017unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2018{
2019 PyObject *unicode;
2020 Py_ssize_t old_length;
2021
2022 assert(p_unicode != NULL);
2023 unicode = *p_unicode;
2024
2025 assert(unicode != NULL);
2026 assert(PyUnicode_Check(unicode));
2027 assert(0 <= length);
2028
Victor Stinner910337b2011-10-03 03:20:16 +02002029 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002030 old_length = PyUnicode_WSTR_LENGTH(unicode);
2031 else
2032 old_length = PyUnicode_GET_LENGTH(unicode);
2033 if (old_length == length)
2034 return 0;
2035
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002036 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002037 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002038 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002039 return 0;
2040 }
2041
Victor Stinner488fa492011-12-12 00:01:39 +01002042 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002043 PyObject *copy = resize_copy(unicode, length);
2044 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002046 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002048 }
2049
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002051 PyObject *new_unicode = resize_compact(unicode, length);
2052 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002053 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002054 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002055 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002056 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002057 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002058}
2059
Alexander Belopolsky40018472011-02-26 01:02:56 +00002060int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002061PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002062{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002063 PyObject *unicode;
2064 if (p_unicode == NULL) {
2065 PyErr_BadInternalCall();
2066 return -1;
2067 }
2068 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002069 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002070 {
2071 PyErr_BadInternalCall();
2072 return -1;
2073 }
2074 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002075}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002076
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002077/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002078
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002079 WARNING: The function doesn't copy the terminating null character and
2080 doesn't check the maximum character (may write a latin1 character in an
2081 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002082static void
2083unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2084 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002085{
2086 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002087 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002088 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002089
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002090 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002091 switch (kind) {
2092 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002093#ifdef Py_DEBUG
2094 if (PyUnicode_IS_ASCII(unicode)) {
2095 Py_UCS4 maxchar = ucs1lib_find_max_char(
2096 (const Py_UCS1*)str,
2097 (const Py_UCS1*)str + len);
2098 assert(maxchar < 128);
2099 }
2100#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002101 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002102 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103 }
2104 case PyUnicode_2BYTE_KIND: {
2105 Py_UCS2 *start = (Py_UCS2 *)data + index;
2106 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002107
Victor Stinner184252a2012-06-16 02:57:41 +02002108 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 *ucs2 = (Py_UCS2)*str;
2110
2111 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002112 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002113 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002114 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002115 Py_UCS4 *start = (Py_UCS4 *)data + index;
2116 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002117
Victor Stinner184252a2012-06-16 02:57:41 +02002118 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002119 *ucs4 = (Py_UCS4)*str;
2120
2121 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002122 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002123 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002124 default:
2125 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 }
2127}
2128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002130get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002132 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002133
Victor Stinner2f9ada92020-06-24 02:22:21 +02002134 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002135 if (unicode) {
2136 Py_INCREF(unicode);
2137 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 }
Victor Stinner607b1022020-05-05 18:50:30 +02002139
2140 unicode = PyUnicode_New(1, ch);
2141 if (!unicode) {
2142 return NULL;
2143 }
2144
2145 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2146 assert(_PyUnicode_CheckConsistency(unicode, 1));
2147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002149 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002150 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151}
2152
Victor Stinner985a82a2014-01-03 12:53:47 +01002153static PyObject*
2154unicode_char(Py_UCS4 ch)
2155{
2156 PyObject *unicode;
2157
2158 assert(ch <= MAX_UNICODE);
2159
Victor Stinner2f9ada92020-06-24 02:22:21 +02002160 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002161 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002162 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002163
Victor Stinner985a82a2014-01-03 12:53:47 +01002164 unicode = PyUnicode_New(1, ch);
2165 if (unicode == NULL)
2166 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002167
2168 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2169 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002170 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002171 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002172 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2173 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2174 }
2175 assert(_PyUnicode_CheckConsistency(unicode, 1));
2176 return unicode;
2177}
2178
Alexander Belopolsky40018472011-02-26 01:02:56 +00002179PyObject *
2180PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002182 if (u == NULL) {
2183 if (size > 0) {
2184 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2185 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2186 "use PyUnicode_New() instead", 1) < 0) {
2187 return NULL;
2188 }
2189 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002190 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002191 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002192
2193 if (size < 0) {
2194 PyErr_BadInternalCall();
2195 return NULL;
2196 }
2197
2198 return PyUnicode_FromWideChar(u, size);
2199}
2200
2201PyObject *
2202PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2203{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002204 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 Py_UCS4 maxchar = 0;
2206 Py_ssize_t num_surrogates;
2207
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002208 if (u == NULL && size != 0) {
2209 PyErr_BadInternalCall();
2210 return NULL;
2211 }
2212
2213 if (size == -1) {
2214 size = wcslen(u);
2215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002217 /* If the Unicode data is known at construction time, we can apply
2218 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002221 if (size == 0)
2222 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002223
Jakub Kulík9032cf52021-04-30 15:21:42 +02002224#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2225 /* Oracle Solaris uses non-Unicode internal wchar_t form for
2226 non-Unicode locales and hence needs conversion to UCS-4 first. */
2227 if (_Py_LocaleUsesNonUnicodeWchar()) {
2228 wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2229 if (!converted) {
2230 return NULL;
2231 }
2232 PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2233 PyMem_Free(converted);
2234 return unicode;
2235 }
2236#endif
2237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 /* Single character Unicode objects in the Latin-1 range are
2239 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002240 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 return get_latin1_char((unsigned char)*u);
2242
2243 /* If not empty and not single character, copy the Unicode data
2244 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002245 if (find_maxchar_surrogates(u, u + size,
2246 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 return NULL;
2248
Victor Stinner8faf8212011-12-08 22:14:11 +01002249 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 if (!unicode)
2251 return NULL;
2252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 switch (PyUnicode_KIND(unicode)) {
2254 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002255 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2257 break;
2258 case PyUnicode_2BYTE_KIND:
2259#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002260 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002262 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2264#endif
2265 break;
2266 case PyUnicode_4BYTE_KIND:
2267#if SIZEOF_WCHAR_T == 2
2268 /* This is the only case which has to process surrogates, thus
2269 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002270 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271#else
2272 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002273 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274#endif
2275 break;
2276 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002277 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002280 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281}
2282
Alexander Belopolsky40018472011-02-26 01:02:56 +00002283PyObject *
2284PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002285{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002286 if (size < 0) {
2287 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002288 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002289 return NULL;
2290 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002291 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002292 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002293 }
2294 else {
2295 if (size > 0) {
2296 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2297 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2298 "use PyUnicode_New() instead", 1) < 0) {
2299 return NULL;
2300 }
2301 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002302 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002303 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002304}
2305
Alexander Belopolsky40018472011-02-26 01:02:56 +00002306PyObject *
2307PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002308{
2309 size_t size = strlen(u);
2310 if (size > PY_SSIZE_T_MAX) {
2311 PyErr_SetString(PyExc_OverflowError, "input too long");
2312 return NULL;
2313 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002314 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002315}
2316
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002317
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002318PyObject *
2319_PyUnicode_FromId(_Py_Identifier *id)
2320{
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002321 PyInterpreterState *interp = _PyInterpreterState_GET();
2322 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2323
Pablo Galindoa6d63a22020-12-29 00:28:09 +00002324 Py_ssize_t index = _Py_atomic_size_get(&id->index);
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002325 if (index < 0) {
2326 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2327
2328 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2329 // Check again to detect concurrent access. Another thread can have
2330 // initialized the index while this thread waited for the lock.
2331 index = _Py_atomic_size_get(&id->index);
2332 if (index < 0) {
2333 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2334 index = rt_ids->next_index;
2335 rt_ids->next_index++;
2336 _Py_atomic_size_set(&id->index, index);
2337 }
2338 PyThread_release_lock(rt_ids->lock);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002339 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002340 assert(index >= 0);
Victor Stinner297257f2020-06-02 14:39:45 +02002341
2342 PyObject *obj;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002343 if (index < ids->size) {
2344 obj = ids->array[index];
2345 if (obj) {
2346 // Return a borrowed reference
2347 return obj;
2348 }
2349 }
2350
2351 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
Victor Stinner297257f2020-06-02 14:39:45 +02002352 NULL, NULL);
2353 if (!obj) {
2354 return NULL;
2355 }
2356 PyUnicode_InternInPlace(&obj);
2357
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002358 if (index >= ids->size) {
2359 // Overallocate to reduce the number of realloc
2360 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2361 Py_ssize_t item_size = sizeof(ids->array[0]);
2362 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2363 if (new_array == NULL) {
2364 PyErr_NoMemory();
2365 return NULL;
2366 }
2367 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2368 ids->array = new_array;
2369 ids->size = new_size;
2370 }
2371
2372 // The array stores a strong reference
2373 ids->array[index] = obj;
2374
2375 // Return a borrowed reference
2376 return obj;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002377}
2378
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002379
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002380static void
Victor Stinnerf4507232020-12-26 20:26:08 +01002381unicode_clear_identifiers(struct _Py_unicode_state *state)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002382{
Victor Stinnerf4507232020-12-26 20:26:08 +01002383 struct _Py_unicode_ids *ids = &state->ids;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002384 for (Py_ssize_t i=0; i < ids->size; i++) {
2385 Py_XDECREF(ids->array[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002386 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002387 ids->size = 0;
2388 PyMem_Free(ids->array);
2389 ids->array = NULL;
2390 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2391 // after Py_Finalize().
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002392}
2393
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002394
Benjamin Peterson0df54292012-03-26 14:50:32 -04002395/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002396
Victor Stinnerd3f08822012-05-29 12:57:52 +02002397PyObject*
2398_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002399{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002400 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002401 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002402 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002403#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002404 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002405#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002406 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002407 }
Victor Stinner785938e2011-12-11 20:09:03 +01002408 unicode = PyUnicode_New(size, 127);
2409 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002410 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002411 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2412 assert(_PyUnicode_CheckConsistency(unicode, 1));
2413 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002414}
2415
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002416static Py_UCS4
2417kind_maxchar_limit(unsigned int kind)
2418{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002419 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002420 case PyUnicode_1BYTE_KIND:
2421 return 0x80;
2422 case PyUnicode_2BYTE_KIND:
2423 return 0x100;
2424 case PyUnicode_4BYTE_KIND:
2425 return 0x10000;
2426 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002427 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002428 }
2429}
2430
Victor Stinner702c7342011-10-05 13:50:52 +02002431static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002432_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002435 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002436
Victor Stinner2f9ada92020-06-24 02:22:21 +02002437 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002438 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002439 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002440 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002441 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002442 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002443 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002444
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002445 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002446 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 if (!res)
2448 return NULL;
2449 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002450 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002452}
2453
Victor Stinnere57b1c02011-09-28 22:20:48 +02002454static PyObject*
2455_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456{
2457 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002458 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002459
Serhiy Storchaka678db842013-01-26 12:16:36 +02002460 if (size == 0)
2461 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002462 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002463 if (size == 1)
2464 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002465
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002466 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002467 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 if (!res)
2469 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002470 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002472 else {
2473 _PyUnicode_CONVERT_BYTES(
2474 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2475 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002476 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 return res;
2478}
2479
Victor Stinnere57b1c02011-09-28 22:20:48 +02002480static PyObject*
2481_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002482{
2483 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002484 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002485
Serhiy Storchaka678db842013-01-26 12:16:36 +02002486 if (size == 0)
2487 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002488 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002489 if (size == 1)
2490 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002491
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002492 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002493 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 if (!res)
2495 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002496 if (max_char < 256)
2497 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2498 PyUnicode_1BYTE_DATA(res));
2499 else if (max_char < 0x10000)
2500 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2501 PyUnicode_2BYTE_DATA(res));
2502 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002504 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 return res;
2506}
2507
2508PyObject*
2509PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2510{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002511 if (size < 0) {
2512 PyErr_SetString(PyExc_ValueError, "size must be positive");
2513 return NULL;
2514 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002515 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002517 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002519 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002521 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002522 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002523 PyErr_SetString(PyExc_SystemError, "invalid kind");
2524 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526}
2527
Victor Stinnerece58de2012-04-23 23:36:38 +02002528Py_UCS4
2529_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2530{
2531 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002532 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002533
2534 assert(PyUnicode_IS_READY(unicode));
2535 assert(0 <= start);
2536 assert(end <= PyUnicode_GET_LENGTH(unicode));
2537 assert(start <= end);
2538
2539 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2540 return PyUnicode_MAX_CHAR_VALUE(unicode);
2541
2542 if (start == end)
2543 return 127;
2544
Victor Stinner94d558b2012-04-27 22:26:58 +02002545 if (PyUnicode_IS_ASCII(unicode))
2546 return 127;
2547
Victor Stinnerece58de2012-04-23 23:36:38 +02002548 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002549 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002550 endptr = (char *)startptr + end * kind;
2551 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002552 switch(kind) {
2553 case PyUnicode_1BYTE_KIND:
2554 return ucs1lib_find_max_char(startptr, endptr);
2555 case PyUnicode_2BYTE_KIND:
2556 return ucs2lib_find_max_char(startptr, endptr);
2557 case PyUnicode_4BYTE_KIND:
2558 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002559 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002560 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002561 }
2562}
2563
Victor Stinner25a4b292011-10-06 12:31:55 +02002564/* Ensure that a string uses the most efficient storage, if it is not the
2565 case: create a new string with of the right kind. Write NULL into *p_unicode
2566 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002567static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002568unicode_adjust_maxchar(PyObject **p_unicode)
2569{
2570 PyObject *unicode, *copy;
2571 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002572 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002573 unsigned int kind;
2574
2575 assert(p_unicode != NULL);
2576 unicode = *p_unicode;
2577 assert(PyUnicode_IS_READY(unicode));
2578 if (PyUnicode_IS_ASCII(unicode))
2579 return;
2580
2581 len = PyUnicode_GET_LENGTH(unicode);
2582 kind = PyUnicode_KIND(unicode);
2583 if (kind == PyUnicode_1BYTE_KIND) {
2584 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002585 max_char = ucs1lib_find_max_char(u, u + len);
2586 if (max_char >= 128)
2587 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002588 }
2589 else if (kind == PyUnicode_2BYTE_KIND) {
2590 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002591 max_char = ucs2lib_find_max_char(u, u + len);
2592 if (max_char >= 256)
2593 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002594 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002595 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002596 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002597 max_char = ucs4lib_find_max_char(u, u + len);
2598 if (max_char >= 0x10000)
2599 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002600 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002601 else
2602 Py_UNREACHABLE();
2603
Victor Stinner25a4b292011-10-06 12:31:55 +02002604 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002605 if (copy != NULL)
2606 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002607 Py_DECREF(unicode);
2608 *p_unicode = copy;
2609}
2610
Victor Stinner034f6cf2011-09-30 02:26:44 +02002611PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002612_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002613{
Victor Stinner87af4f22011-11-21 23:03:47 +01002614 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002615 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002616
Victor Stinner034f6cf2011-09-30 02:26:44 +02002617 if (!PyUnicode_Check(unicode)) {
2618 PyErr_BadInternalCall();
2619 return NULL;
2620 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002621 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002622 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002623
Victor Stinner87af4f22011-11-21 23:03:47 +01002624 length = PyUnicode_GET_LENGTH(unicode);
2625 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002626 if (!copy)
2627 return NULL;
2628 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2629
Christian Heimesf051e432016-09-13 20:22:02 +02002630 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002631 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002632 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002633 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002634}
2635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636
Victor Stinnerbc603d12011-10-02 01:00:40 +02002637/* Widen Unicode objects to larger buffers. Don't write terminating null
2638 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002640static void*
2641unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002643 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002644
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002645 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002646 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002647 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002648 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002649 if (!result)
2650 return PyErr_NoMemory();
2651 assert(skind == PyUnicode_1BYTE_KIND);
2652 _PyUnicode_CONVERT_BYTES(
2653 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002654 (const Py_UCS1 *)data,
2655 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002656 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002658 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002659 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002660 if (!result)
2661 return PyErr_NoMemory();
2662 if (skind == PyUnicode_2BYTE_KIND) {
2663 _PyUnicode_CONVERT_BYTES(
2664 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002665 (const Py_UCS2 *)data,
2666 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002667 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002669 else {
2670 assert(skind == PyUnicode_1BYTE_KIND);
2671 _PyUnicode_CONVERT_BYTES(
2672 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002673 (const Py_UCS1 *)data,
2674 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002675 result);
2676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002677 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002678 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002679 Py_UNREACHABLE();
2680 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682}
2683
2684static Py_UCS4*
2685as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2686 int copy_null)
2687{
2688 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002689 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 Py_ssize_t len, targetlen;
2691 if (PyUnicode_READY(string) == -1)
2692 return NULL;
2693 kind = PyUnicode_KIND(string);
2694 data = PyUnicode_DATA(string);
2695 len = PyUnicode_GET_LENGTH(string);
2696 targetlen = len;
2697 if (copy_null)
2698 targetlen++;
2699 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002700 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 if (!target) {
2702 PyErr_NoMemory();
2703 return NULL;
2704 }
2705 }
2706 else {
2707 if (targetsize < targetlen) {
2708 PyErr_Format(PyExc_SystemError,
2709 "string is longer than the buffer");
2710 if (copy_null && 0 < targetsize)
2711 target[0] = 0;
2712 return NULL;
2713 }
2714 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002715 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002716 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002717 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002719 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002720 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002721 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2722 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002723 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002724 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002725 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002726 else {
2727 Py_UNREACHABLE();
2728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 if (copy_null)
2730 target[len] = 0;
2731 return target;
2732}
2733
2734Py_UCS4*
2735PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2736 int copy_null)
2737{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002738 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 PyErr_BadInternalCall();
2740 return NULL;
2741 }
2742 return as_ucs4(string, target, targetsize, copy_null);
2743}
2744
2745Py_UCS4*
2746PyUnicode_AsUCS4Copy(PyObject *string)
2747{
2748 return as_ucs4(string, NULL, 0, 1);
2749}
2750
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).
   The buffers sized with this macro are filled by sprintf(), so the value
   has to cover the terminating null byte as well. */
2754#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002755
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002756static int
2757unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2758 Py_ssize_t width, Py_ssize_t precision)
2759{
2760 Py_ssize_t length, fill, arglen;
2761 Py_UCS4 maxchar;
2762
2763 if (PyUnicode_READY(str) == -1)
2764 return -1;
2765
2766 length = PyUnicode_GET_LENGTH(str);
2767 if ((precision == -1 || precision >= length)
2768 && width <= length)
2769 return _PyUnicodeWriter_WriteStr(writer, str);
2770
2771 if (precision != -1)
2772 length = Py_MIN(precision, length);
2773
2774 arglen = Py_MAX(length, width);
2775 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2776 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2777 else
2778 maxchar = writer->maxchar;
2779
2780 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2781 return -1;
2782
2783 if (width > length) {
2784 fill = width - length;
2785 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2786 return -1;
2787 writer->pos += fill;
2788 }
2789
2790 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2791 str, 0, length);
2792 writer->pos += length;
2793 return 0;
2794}
2795
2796static int
Victor Stinner998b8062018-09-12 00:23:25 +02002797unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002798 Py_ssize_t width, Py_ssize_t precision)
2799{
2800 /* UTF-8 */
2801 Py_ssize_t length;
2802 PyObject *unicode;
2803 int res;
2804
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002805 if (precision == -1) {
2806 length = strlen(str);
2807 }
2808 else {
2809 length = 0;
2810 while (length < precision && str[length]) {
2811 length++;
2812 }
2813 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002814 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2815 if (unicode == NULL)
2816 return -1;
2817
2818 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2819 Py_DECREF(unicode);
2820 return res;
2821}
2822
Victor Stinner96865452011-03-01 23:44:09 +00002823static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002824unicode_fromformat_arg(_PyUnicodeWriter *writer,
2825 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002826{
Victor Stinnere215d962012-10-06 23:03:36 +02002827 const char *p;
2828 Py_ssize_t len;
2829 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002830 Py_ssize_t width;
2831 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002832 int longflag;
2833 int longlongflag;
2834 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002835 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002836
2837 p = f;
2838 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002839 zeropad = 0;
2840 if (*f == '0') {
2841 zeropad = 1;
2842 f++;
2843 }
Victor Stinner96865452011-03-01 23:44:09 +00002844
2845 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002846 width = -1;
2847 if (Py_ISDIGIT((unsigned)*f)) {
2848 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002849 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002850 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002851 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002852 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002853 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002854 return NULL;
2855 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002856 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002857 f++;
2858 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002859 }
2860 precision = -1;
2861 if (*f == '.') {
2862 f++;
2863 if (Py_ISDIGIT((unsigned)*f)) {
2864 precision = (*f - '0');
2865 f++;
2866 while (Py_ISDIGIT((unsigned)*f)) {
2867 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2868 PyErr_SetString(PyExc_ValueError,
2869 "precision too big");
2870 return NULL;
2871 }
2872 precision = (precision * 10) + (*f - '0');
2873 f++;
2874 }
2875 }
Victor Stinner96865452011-03-01 23:44:09 +00002876 if (*f == '%') {
2877 /* "%.3%s" => f points to "3" */
2878 f--;
2879 }
2880 }
2881 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002882 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002883 f--;
2884 }
Victor Stinner96865452011-03-01 23:44:09 +00002885
2886 /* Handle %ld, %lu, %lld and %llu. */
2887 longflag = 0;
2888 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002889 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002890 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002891 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002892 longflag = 1;
2893 ++f;
2894 }
Victor Stinner96865452011-03-01 23:44:09 +00002895 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002896 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002897 longlongflag = 1;
2898 f += 2;
2899 }
Victor Stinner96865452011-03-01 23:44:09 +00002900 }
2901 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002902 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002903 size_tflag = 1;
2904 ++f;
2905 }
Victor Stinnere215d962012-10-06 23:03:36 +02002906
2907 if (f[1] == '\0')
2908 writer->overallocate = 0;
2909
2910 switch (*f) {
2911 case 'c':
2912 {
2913 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002914 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002915 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002916 "character argument not in range(0x110000)");
2917 return NULL;
2918 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002919 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002921 break;
2922 }
2923
2924 case 'i':
2925 case 'd':
2926 case 'u':
2927 case 'x':
2928 {
2929 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002930 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002931 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002932
2933 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002934 if (longflag) {
2935 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2936 }
2937 else if (longlongflag) {
2938 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2939 }
2940 else if (size_tflag) {
2941 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2942 }
2943 else {
2944 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2945 }
Victor Stinnere215d962012-10-06 23:03:36 +02002946 }
2947 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002948 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002949 }
2950 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002951 if (longflag) {
2952 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2953 }
2954 else if (longlongflag) {
2955 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2956 }
2957 else if (size_tflag) {
2958 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2959 }
2960 else {
2961 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2962 }
Victor Stinnere215d962012-10-06 23:03:36 +02002963 }
2964 assert(len >= 0);
2965
Victor Stinnere215d962012-10-06 23:03:36 +02002966 if (precision < len)
2967 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002968
2969 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002970 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2971 return NULL;
2972
Victor Stinnere215d962012-10-06 23:03:36 +02002973 if (width > precision) {
2974 Py_UCS4 fillchar;
2975 fill = width - precision;
2976 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002977 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2978 return NULL;
2979 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002980 }
Victor Stinner15a11362012-10-06 23:48:20 +02002981 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002982 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002983 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2984 return NULL;
2985 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002986 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002987
Victor Stinner4a587072013-11-19 12:54:53 +01002988 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2989 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002990 break;
2991 }
2992
2993 case 'p':
2994 {
2995 char number[MAX_LONG_LONG_CHARS];
2996
2997 len = sprintf(number, "%p", va_arg(*vargs, void*));
2998 assert(len >= 0);
2999
3000 /* %p is ill-defined: ensure leading 0x. */
3001 if (number[1] == 'X')
3002 number[1] = 'x';
3003 else if (number[1] != 'x') {
3004 memmove(number + 2, number,
3005 strlen(number) + 1);
3006 number[0] = '0';
3007 number[1] = 'x';
3008 len += 2;
3009 }
3010
Victor Stinner4a587072013-11-19 12:54:53 +01003011 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003012 return NULL;
3013 break;
3014 }
3015
3016 case 's':
3017 {
3018 /* UTF-8 */
3019 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02003020 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003021 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003022 break;
3023 }
3024
3025 case 'U':
3026 {
3027 PyObject *obj = va_arg(*vargs, PyObject *);
3028 assert(obj && _PyUnicode_CHECK(obj));
3029
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003030 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003031 return NULL;
3032 break;
3033 }
3034
3035 case 'V':
3036 {
3037 PyObject *obj = va_arg(*vargs, PyObject *);
3038 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02003039 if (obj) {
3040 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003041 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003042 return NULL;
3043 }
3044 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003045 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02003046 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003047 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003048 }
3049 break;
3050 }
3051
3052 case 'S':
3053 {
3054 PyObject *obj = va_arg(*vargs, PyObject *);
3055 PyObject *str;
3056 assert(obj);
3057 str = PyObject_Str(obj);
3058 if (!str)
3059 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003060 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003061 Py_DECREF(str);
3062 return NULL;
3063 }
3064 Py_DECREF(str);
3065 break;
3066 }
3067
3068 case 'R':
3069 {
3070 PyObject *obj = va_arg(*vargs, PyObject *);
3071 PyObject *repr;
3072 assert(obj);
3073 repr = PyObject_Repr(obj);
3074 if (!repr)
3075 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003076 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003077 Py_DECREF(repr);
3078 return NULL;
3079 }
3080 Py_DECREF(repr);
3081 break;
3082 }
3083
3084 case 'A':
3085 {
3086 PyObject *obj = va_arg(*vargs, PyObject *);
3087 PyObject *ascii;
3088 assert(obj);
3089 ascii = PyObject_ASCII(obj);
3090 if (!ascii)
3091 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003092 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003093 Py_DECREF(ascii);
3094 return NULL;
3095 }
3096 Py_DECREF(ascii);
3097 break;
3098 }
3099
3100 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003101 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003102 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003103 break;
3104
3105 default:
3106 /* if we stumble upon an unknown formatting code, copy the rest
3107 of the format string to the output string. (we cannot just
3108 skip the code, since there's no way to know what's in the
3109 argument list) */
3110 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003111 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003112 return NULL;
3113 f = p+len;
3114 return f;
3115 }
3116
3117 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003118 return f;
3119}
3120
Walter Dörwaldd2034312007-05-18 16:29:38 +00003121PyObject *
3122PyUnicode_FromFormatV(const char *format, va_list vargs)
3123{
Victor Stinnere215d962012-10-06 23:03:36 +02003124 va_list vargs2;
3125 const char *f;
3126 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003127
Victor Stinner8f674cc2013-04-17 23:02:17 +02003128 _PyUnicodeWriter_Init(&writer);
3129 writer.min_length = strlen(format) + 100;
3130 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003131
Benjamin Peterson0c212142016-09-20 20:39:33 -07003132 // Copy varags to be able to pass a reference to a subfunction.
3133 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003134
3135 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003136 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003137 f = unicode_fromformat_arg(&writer, f, &vargs2);
3138 if (f == NULL)
3139 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003141 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003142 const char *p;
3143 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003144
Victor Stinnere215d962012-10-06 23:03:36 +02003145 p = f;
3146 do
3147 {
3148 if ((unsigned char)*p > 127) {
3149 PyErr_Format(PyExc_ValueError,
3150 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3151 "string, got a non-ASCII byte: 0x%02x",
3152 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003153 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003154 }
3155 p++;
3156 }
3157 while (*p != '\0' && *p != '%');
3158 len = p - f;
3159
3160 if (*p == '\0')
3161 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003162
3163 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003164 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003165
3166 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003167 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003168 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003169 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003170 return _PyUnicodeWriter_Finish(&writer);
3171
3172 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003173 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003174 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003175 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003176}
3177
Walter Dörwaldd2034312007-05-18 16:29:38 +00003178PyObject *
3179PyUnicode_FromFormat(const char *format, ...)
3180{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003181 PyObject* ret;
3182 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003183
3184#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003185 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003186#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003187 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003188#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003189 ret = PyUnicode_FromFormatV(format, vargs);
3190 va_end(vargs);
3191 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003192}
3193
Serhiy Storchakac46db922018-10-23 22:58:24 +03003194static Py_ssize_t
3195unicode_get_widechar_size(PyObject *unicode)
3196{
3197 Py_ssize_t res;
3198
3199 assert(unicode != NULL);
3200 assert(_PyUnicode_CHECK(unicode));
3201
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003202#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003203 if (_PyUnicode_WSTR(unicode) != NULL) {
3204 return PyUnicode_WSTR_LENGTH(unicode);
3205 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003206#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003207 assert(PyUnicode_IS_READY(unicode));
3208
3209 res = _PyUnicode_LENGTH(unicode);
3210#if SIZEOF_WCHAR_T == 2
3211 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3212 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3213 const Py_UCS4 *end = s + res;
3214 for (; s < end; ++s) {
3215 if (*s > 0xFFFF) {
3216 ++res;
3217 }
3218 }
3219 }
3220#endif
3221 return res;
3222}
3223
3224static void
3225unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3226{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003227 assert(unicode != NULL);
3228 assert(_PyUnicode_CHECK(unicode));
3229
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003230#if USE_UNICODE_WCHAR_CACHE
3231 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003232 if (wstr != NULL) {
3233 memcpy(w, wstr, size * sizeof(wchar_t));
3234 return;
3235 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003236#else /* USE_UNICODE_WCHAR_CACHE */
3237 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3238 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3239 return;
3240 }
3241#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003242 assert(PyUnicode_IS_READY(unicode));
3243
3244 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3245 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3246 for (; size--; ++s, ++w) {
3247 *w = *s;
3248 }
3249 }
3250 else {
3251#if SIZEOF_WCHAR_T == 4
3252 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3253 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3254 for (; size--; ++s, ++w) {
3255 *w = *s;
3256 }
3257#else
3258 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3259 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3260 for (; size--; ++s, ++w) {
3261 Py_UCS4 ch = *s;
3262 if (ch > 0xFFFF) {
3263 assert(ch <= MAX_UNICODE);
3264 /* encode surrogate pair in this case */
3265 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3266 if (!size--)
3267 break;
3268 *w = Py_UNICODE_LOW_SURROGATE(ch);
3269 }
3270 else {
3271 *w = ch;
3272 }
3273 }
3274#endif
3275 }
3276}
3277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003278#ifdef HAVE_WCHAR_H
3279
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003280/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003281
Victor Stinnerd88d9832011-09-06 02:00:05 +02003282 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003283 character) required to convert the unicode object. Ignore size argument.
3284
Victor Stinnerd88d9832011-09-06 02:00:05 +02003285 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003286 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003287 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003288Py_ssize_t
3289PyUnicode_AsWideChar(PyObject *unicode,
3290 wchar_t *w,
3291 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003292{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003293 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003294
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003295 if (unicode == NULL) {
3296 PyErr_BadInternalCall();
3297 return -1;
3298 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003299 if (!PyUnicode_Check(unicode)) {
3300 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003302 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003303
3304 res = unicode_get_widechar_size(unicode);
3305 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003306 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003307 }
3308
3309 if (size > res) {
3310 size = res + 1;
3311 }
3312 else {
3313 res = size;
3314 }
3315 unicode_copy_as_widechar(unicode, w, size);
Jakub Kulík9032cf52021-04-30 15:21:42 +02003316
3317#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3318 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3319 non-Unicode locales and hence needs conversion first. */
3320 if (_Py_LocaleUsesNonUnicodeWchar()) {
3321 if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3322 return -1;
3323 }
3324 }
3325#endif
3326
Serhiy Storchakac46db922018-10-23 22:58:24 +03003327 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003328}
3329
Victor Stinner137c34c2010-09-29 10:25:54 +00003330wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003331PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003332 Py_ssize_t *size)
3333{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003334 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003335 Py_ssize_t buflen;
3336
3337 if (unicode == NULL) {
3338 PyErr_BadInternalCall();
3339 return NULL;
3340 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003341 if (!PyUnicode_Check(unicode)) {
3342 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003343 return NULL;
3344 }
3345
Serhiy Storchakac46db922018-10-23 22:58:24 +03003346 buflen = unicode_get_widechar_size(unicode);
3347 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003348 if (buffer == NULL) {
3349 PyErr_NoMemory();
3350 return NULL;
3351 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003352 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
Jakub Kulík9032cf52021-04-30 15:21:42 +02003353
3354#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3355 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3356 non-Unicode locales and hence needs conversion first. */
3357 if (_Py_LocaleUsesNonUnicodeWchar()) {
3358 if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3359 return NULL;
3360 }
3361 }
3362#endif
3363
Serhiy Storchakac46db922018-10-23 22:58:24 +03003364 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003365 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003366 }
3367 else if (wcslen(buffer) != (size_t)buflen) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01003368 PyMem_Free(buffer);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003369 PyErr_SetString(PyExc_ValueError,
3370 "embedded null character");
3371 return NULL;
3372 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003373 return buffer;
3374}
3375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003376#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003378int
3379_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3380{
3381 wchar_t **p = (wchar_t **)ptr;
3382 if (obj == NULL) {
3383#if !USE_UNICODE_WCHAR_CACHE
3384 PyMem_Free(*p);
3385#endif /* USE_UNICODE_WCHAR_CACHE */
3386 *p = NULL;
3387 return 1;
3388 }
3389 if (PyUnicode_Check(obj)) {
3390#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003391 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3392 if (*p == NULL) {
3393 return 0;
3394 }
3395 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003396#else /* USE_UNICODE_WCHAR_CACHE */
3397 *p = PyUnicode_AsWideCharString(obj, NULL);
3398 if (*p == NULL) {
3399 return 0;
3400 }
3401 return Py_CLEANUP_SUPPORTED;
3402#endif /* USE_UNICODE_WCHAR_CACHE */
3403 }
3404 PyErr_Format(PyExc_TypeError,
3405 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003406 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003407 return 0;
3408}
3409
3410int
3411_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3412{
3413 wchar_t **p = (wchar_t **)ptr;
3414 if (obj == NULL) {
3415#if !USE_UNICODE_WCHAR_CACHE
3416 PyMem_Free(*p);
3417#endif /* USE_UNICODE_WCHAR_CACHE */
3418 *p = NULL;
3419 return 1;
3420 }
3421 if (obj == Py_None) {
3422 *p = NULL;
3423 return 1;
3424 }
3425 if (PyUnicode_Check(obj)) {
3426#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003427 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3428 if (*p == NULL) {
3429 return 0;
3430 }
3431 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003432#else /* USE_UNICODE_WCHAR_CACHE */
3433 *p = PyUnicode_AsWideCharString(obj, NULL);
3434 if (*p == NULL) {
3435 return 0;
3436 }
3437 return Py_CLEANUP_SUPPORTED;
3438#endif /* USE_UNICODE_WCHAR_CACHE */
3439 }
3440 PyErr_Format(PyExc_TypeError,
3441 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003442 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003443 return 0;
3444}
3445
Alexander Belopolsky40018472011-02-26 01:02:56 +00003446PyObject *
3447PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003448{
Victor Stinner8faf8212011-12-08 22:14:11 +01003449 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003450 PyErr_SetString(PyExc_ValueError,
3451 "chr() arg not in range(0x110000)");
3452 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003453 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003454
Victor Stinner985a82a2014-01-03 12:53:47 +01003455 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003456}
3457
Alexander Belopolsky40018472011-02-26 01:02:56 +00003458PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003459PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003461 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003462 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003463 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003464 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003465 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003466 Py_INCREF(obj);
3467 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003468 }
3469 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 /* For a Unicode subtype that's not a Unicode object,
3471 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003472 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003473 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003474 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003475 "Can't convert '%.100s' object to str implicitly",
3476 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003477 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003478}
3479
Alexander Belopolsky40018472011-02-26 01:02:56 +00003480PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003481PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003482 const char *encoding,
3483 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003484{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003485 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003486 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003487
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003489 PyErr_BadInternalCall();
3490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003492
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003493 /* Decoding bytes objects is the most common case and should be fast */
3494 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003495 if (PyBytes_GET_SIZE(obj) == 0) {
3496 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3497 return NULL;
3498 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003499 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003500 }
3501 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003502 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3503 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003504 }
3505
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003506 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003507 PyErr_SetString(PyExc_TypeError,
3508 "decoding str is not supported");
3509 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003510 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003511
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003512 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3513 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3514 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003515 "decoding to str: need a bytes-like object, %.80s found",
3516 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003517 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003518 }
Tim Petersced69f82003-09-16 20:30:58 +00003519
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003520 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003521 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003522 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3523 return NULL;
3524 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003525 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003527
Serhiy Storchaka05997252013-01-26 12:14:02 +02003528 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003529 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003530 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531}
3532
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   ASCII alphanumeric characters and '.' are copied in lowercase. Every run of
   other characters becomes a single '_', except that a leading run is dropped
   and a trailing run is ignored. The result is always NUL-terminated on
   success.

   encoding:  NUL-terminated input name, must not be NULL.
   lower:     output buffer.
   lower_len: size of the output buffer in bytes, including the room for the
              terminating NUL.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the terminating NUL fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    /* Without this guard, lower_len - 1 would wrap around and the end
       pointer computed below would lie outside the buffer. */
    if (lower_len == 0) {
        return 0;
    }

    char *out = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    char *const out_end = lower + (lower_len - 1);
    /* Set while a run of separator characters is pending. */
    int pending_sep = 0;

    for (const char *in = encoding; *in != '\0'; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        /* Emit the single '_' for the pending run, unless nothing has
           been written yet (leading separators are dropped). */
        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3581
Alexander Belopolsky40018472011-02-26 01:02:56 +00003582PyObject *
3583PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003584 Py_ssize_t size,
3585 const char *encoding,
3586 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003587{
3588 PyObject *buffer = NULL, *unicode;
3589 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003590 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3591
Victor Stinner22eb6892019-06-26 00:51:05 +02003592 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3593 return NULL;
3594 }
3595
Victor Stinnered076ed2019-06-26 01:49:32 +02003596 if (size == 0) {
3597 _Py_RETURN_UNICODE_EMPTY();
3598 }
3599
Victor Stinner942889a2016-09-05 15:40:10 -07003600 if (encoding == NULL) {
3601 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3602 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003603
Fred Drakee4315f52000-05-09 19:53:39 +00003604 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003605 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3606 char *lower = buflower;
3607
3608 /* Fast paths */
3609 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3610 lower += 3;
3611 if (*lower == '_') {
3612 /* Match "utf8" and "utf_8" */
3613 lower++;
3614 }
3615
3616 if (lower[0] == '8' && lower[1] == 0) {
3617 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3618 }
3619 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3620 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3621 }
3622 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3623 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3624 }
3625 }
3626 else {
3627 if (strcmp(lower, "ascii") == 0
3628 || strcmp(lower, "us_ascii") == 0) {
3629 return PyUnicode_DecodeASCII(s, size, errors);
3630 }
Steve Dowercc16be82016-09-08 10:35:16 -07003631 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003632 else if (strcmp(lower, "mbcs") == 0) {
3633 return PyUnicode_DecodeMBCS(s, size, errors);
3634 }
3635 #endif
3636 else if (strcmp(lower, "latin1") == 0
3637 || strcmp(lower, "latin_1") == 0
3638 || strcmp(lower, "iso_8859_1") == 0
3639 || strcmp(lower, "iso8859_1") == 0) {
3640 return PyUnicode_DecodeLatin1(s, size, errors);
3641 }
3642 }
Victor Stinner37296e82010-06-10 13:36:23 +00003643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644
3645 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003646 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003647 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003648 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003649 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 if (buffer == NULL)
3651 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003652 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 if (unicode == NULL)
3654 goto onError;
3655 if (!PyUnicode_Check(unicode)) {
3656 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003657 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003658 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003659 encoding,
3660 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 Py_DECREF(unicode);
3662 goto onError;
3663 }
3664 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003665 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003666
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 Py_XDECREF(buffer);
3669 return NULL;
3670}
3671
Alexander Belopolsky40018472011-02-26 01:02:56 +00003672PyObject *
3673PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003674 const char *encoding,
3675 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003676{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003677 if (!PyUnicode_Check(unicode)) {
3678 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003679 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003680 }
3681
Serhiy Storchaka00939072016-10-27 21:05:49 +03003682 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3683 "PyUnicode_AsDecodedObject() is deprecated; "
3684 "use PyCodec_Decode() to decode from str", 1) < 0)
3685 return NULL;
3686
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003687 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003689
3690 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003691 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003692}
3693
Alexander Belopolsky40018472011-02-26 01:02:56 +00003694PyObject *
3695PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003696 const char *encoding,
3697 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003698{
3699 PyObject *v;
3700
3701 if (!PyUnicode_Check(unicode)) {
3702 PyErr_BadArgument();
3703 goto onError;
3704 }
3705
Serhiy Storchaka00939072016-10-27 21:05:49 +03003706 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3707 "PyUnicode_AsDecodedUnicode() is deprecated; "
3708 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3709 return NULL;
3710
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003711 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003713
3714 /* Decode via the codec registry */
3715 v = PyCodec_Decode(unicode, encoding, errors);
3716 if (v == NULL)
3717 goto onError;
3718 if (!PyUnicode_Check(v)) {
3719 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003720 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003721 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003722 encoding,
3723 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003724 Py_DECREF(v);
3725 goto onError;
3726 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003727 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003728
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003730 return NULL;
3731}
3732
Alexander Belopolsky40018472011-02-26 01:02:56 +00003733PyObject *
3734PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003735 Py_ssize_t size,
3736 const char *encoding,
3737 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738{
3739 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003740
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003741 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3745 Py_DECREF(unicode);
3746 return v;
3747}
3748
Alexander Belopolsky40018472011-02-26 01:02:56 +00003749PyObject *
3750PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003751 const char *encoding,
3752 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003753{
3754 PyObject *v;
3755
3756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 goto onError;
3759 }
3760
Serhiy Storchaka00939072016-10-27 21:05:49 +03003761 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3762 "PyUnicode_AsEncodedObject() is deprecated; "
3763 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3764 "or PyCodec_Encode() for generic encoding", 1) < 0)
3765 return NULL;
3766
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003767 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003768 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003769
3770 /* Encode via the codec registry */
3771 v = PyCodec_Encode(unicode, encoding, errors);
3772 if (v == NULL)
3773 goto onError;
3774 return v;
3775
Benjamin Peterson29060642009-01-31 22:14:21 +00003776 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003777 return NULL;
3778}
3779
Victor Stinner1b579672011-12-17 05:47:23 +01003780
Victor Stinner2cba6b82018-01-10 22:46:15 +01003781static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003782unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003783 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003784{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003785 Py_ssize_t wlen;
3786 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3787 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003788 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003789 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003790
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003791 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003792 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003793 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003794 return NULL;
3795 }
3796
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003797 char *str;
3798 size_t error_pos;
3799 const char *reason;
3800 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003801 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003802 PyMem_Free(wstr);
3803
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003804 if (res != 0) {
3805 if (res == -2) {
3806 PyObject *exc;
3807 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3808 "locale", unicode,
3809 (Py_ssize_t)error_pos,
3810 (Py_ssize_t)(error_pos+1),
3811 reason);
3812 if (exc != NULL) {
3813 PyCodec_StrictErrors(exc);
3814 Py_DECREF(exc);
3815 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003816 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003817 else if (res == -3) {
3818 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3819 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003820 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003821 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003822 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003823 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003824 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003825
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003826 PyObject *bytes = PyBytes_FromString(str);
3827 PyMem_RawFree(str);
3828 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003829}
3830
Victor Stinnerad158722010-10-27 00:25:46 +00003831PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003832PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3833{
Victor Stinner709d23d2019-05-02 14:56:30 -04003834 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3835 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003836}
3837
3838PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003839PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003840{
Victor Stinner81a7be32020-04-14 15:14:01 +02003841 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003842 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3843 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003844 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003845 fs_codec->error_handler,
3846 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003847 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003848#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003849 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003850 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003851 fs_codec->encoding,
3852 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003853 }
Victor Stinnerad158722010-10-27 00:25:46 +00003854#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003855 else {
3856 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3857 machinery is not ready and so cannot be used:
3858 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003859 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3860 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003861 assert(filesystem_errors != NULL);
3862 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3863 assert(errors != _Py_ERROR_UNKNOWN);
3864#ifdef _Py_FORCE_UTF8_FS_ENCODING
3865 return unicode_encode_utf8(unicode, errors, NULL);
3866#else
3867 return unicode_encode_locale(unicode, errors, 0);
3868#endif
3869 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003870}
3871
Alexander Belopolsky40018472011-02-26 01:02:56 +00003872PyObject *
3873PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003874 const char *encoding,
3875 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876{
3877 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003878 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003879
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880 if (!PyUnicode_Check(unicode)) {
3881 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003882 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 }
Fred Drakee4315f52000-05-09 19:53:39 +00003884
Victor Stinner22eb6892019-06-26 00:51:05 +02003885 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3886 return NULL;
3887 }
3888
Victor Stinner942889a2016-09-05 15:40:10 -07003889 if (encoding == NULL) {
3890 return _PyUnicode_AsUTF8String(unicode, errors);
3891 }
3892
Fred Drakee4315f52000-05-09 19:53:39 +00003893 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003894 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3895 char *lower = buflower;
3896
3897 /* Fast paths */
3898 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3899 lower += 3;
3900 if (*lower == '_') {
3901 /* Match "utf8" and "utf_8" */
3902 lower++;
3903 }
3904
3905 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003907 }
3908 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3909 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3910 }
3911 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3912 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3913 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003914 }
Victor Stinner942889a2016-09-05 15:40:10 -07003915 else {
3916 if (strcmp(lower, "ascii") == 0
3917 || strcmp(lower, "us_ascii") == 0) {
3918 return _PyUnicode_AsASCIIString(unicode, errors);
3919 }
Steve Dowercc16be82016-09-08 10:35:16 -07003920#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003921 else if (strcmp(lower, "mbcs") == 0) {
3922 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3923 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003924#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003925 else if (strcmp(lower, "latin1") == 0 ||
3926 strcmp(lower, "latin_1") == 0 ||
3927 strcmp(lower, "iso_8859_1") == 0 ||
3928 strcmp(lower, "iso8859_1") == 0) {
3929 return _PyUnicode_AsLatin1String(unicode, errors);
3930 }
3931 }
Victor Stinner37296e82010-06-10 13:36:23 +00003932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933
3934 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003935 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003937 return NULL;
3938
3939 /* The normal path */
3940 if (PyBytes_Check(v))
3941 return v;
3942
3943 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003944 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003945 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003946 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003947
3948 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003949 "encoder %s returned bytearray instead of bytes; "
3950 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003951 encoding);
3952 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003953 Py_DECREF(v);
3954 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003955 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003956
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003957 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3958 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003959 Py_DECREF(v);
3960 return b;
3961 }
3962
3963 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003964 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003965 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003966 encoding,
3967 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003968 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003969 return NULL;
3970}
3971
Alexander Belopolsky40018472011-02-26 01:02:56 +00003972PyObject *
3973PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003974 const char *encoding,
3975 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003976{
3977 PyObject *v;
3978
3979 if (!PyUnicode_Check(unicode)) {
3980 PyErr_BadArgument();
3981 goto onError;
3982 }
3983
Serhiy Storchaka00939072016-10-27 21:05:49 +03003984 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3985 "PyUnicode_AsEncodedUnicode() is deprecated; "
3986 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3987 return NULL;
3988
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003989 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003991
3992 /* Encode via the codec registry */
3993 v = PyCodec_Encode(unicode, encoding, errors);
3994 if (v == NULL)
3995 goto onError;
3996 if (!PyUnicode_Check(v)) {
3997 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003998 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003999 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02004000 encoding,
4001 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00004002 Py_DECREF(v);
4003 goto onError;
4004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 return v;
Tim Petersced69f82003-09-16 20:30:58 +00004006
Benjamin Peterson29060642009-01-31 22:14:21 +00004007 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 return NULL;
4009}
4010
Victor Stinner2cba6b82018-01-10 22:46:15 +01004011static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04004012unicode_decode_locale(const char *str, Py_ssize_t len,
4013 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004014{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004015 if (str[len] != '\0' || (size_t)len != strlen(str)) {
4016 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004017 return NULL;
4018 }
4019
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004020 wchar_t *wstr;
4021 size_t wlen;
4022 const char *reason;
4023 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04004024 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004025 if (res != 0) {
4026 if (res == -2) {
4027 PyObject *exc;
4028 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4029 "locale", str, len,
4030 (Py_ssize_t)wlen,
4031 (Py_ssize_t)(wlen + 1),
4032 reason);
4033 if (exc != NULL) {
4034 PyCodec_StrictErrors(exc);
4035 Py_DECREF(exc);
4036 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004037 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02004038 else if (res == -3) {
4039 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4040 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004041 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004042 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01004043 }
Victor Stinner2f197072011-12-17 07:08:30 +01004044 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01004045 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004046
4047 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4048 PyMem_RawFree(wstr);
4049 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004050}
4051
4052PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01004053PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4054 const char *errors)
4055{
Victor Stinner709d23d2019-05-02 14:56:30 -04004056 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4057 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01004058}
4059
4060PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01004061PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004062{
4063 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04004064 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4065 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004066}
4067
4068
4069PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00004070PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004071 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004072 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4073}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004074
Christian Heimes5894ba72007-11-04 11:43:14 +00004075PyObject*
4076PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4077{
Victor Stinner81a7be32020-04-14 15:14:01 +02004078 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004079 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4080 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004081 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004082 fs_codec->error_handler,
4083 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004084 NULL);
4085 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004086#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004087 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004088 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004089 fs_codec->encoding,
4090 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004091 }
Victor Stinnerad158722010-10-27 00:25:46 +00004092#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004093 else {
4094 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4095 machinery is not ready and so cannot be used:
4096 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004097 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4098 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004099 assert(filesystem_errors != NULL);
4100 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4101 assert(errors != _Py_ERROR_UNKNOWN);
4102#ifdef _Py_FORCE_UTF8_FS_ENCODING
4103 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4104#else
4105 return unicode_decode_locale(s, size, errors, 0);
4106#endif
4107 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004108}
4109
Martin v. Löwis011e8422009-05-05 04:43:17 +00004110
4111int
4112PyUnicode_FSConverter(PyObject* arg, void* addr)
4113{
Brett Cannonec6ce872016-09-06 15:50:29 -07004114 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004115 PyObject *output = NULL;
4116 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004117 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004118 if (arg == NULL) {
4119 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004120 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004121 return 1;
4122 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004123 path = PyOS_FSPath(arg);
4124 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004125 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004126 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004127 if (PyBytes_Check(path)) {
4128 output = path;
4129 }
4130 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4131 output = PyUnicode_EncodeFSDefault(path);
4132 Py_DECREF(path);
4133 if (!output) {
4134 return 0;
4135 }
4136 assert(PyBytes_Check(output));
4137 }
4138
Victor Stinner0ea2a462010-04-30 00:22:08 +00004139 size = PyBytes_GET_SIZE(output);
4140 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004141 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004142 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004143 Py_DECREF(output);
4144 return 0;
4145 }
4146 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004147 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004148}
4149
4150
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004151int
4152PyUnicode_FSDecoder(PyObject* arg, void* addr)
4153{
Brett Cannona5711202016-09-06 19:36:01 -07004154 int is_buffer = 0;
4155 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004156 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004157 if (arg == NULL) {
4158 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004159 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004160 return 1;
4161 }
Brett Cannona5711202016-09-06 19:36:01 -07004162
4163 is_buffer = PyObject_CheckBuffer(arg);
4164 if (!is_buffer) {
4165 path = PyOS_FSPath(arg);
4166 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004167 return 0;
4168 }
Brett Cannona5711202016-09-06 19:36:01 -07004169 }
4170 else {
4171 path = arg;
4172 Py_INCREF(arg);
4173 }
4174
4175 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004176 output = path;
4177 }
4178 else if (PyBytes_Check(path) || is_buffer) {
4179 PyObject *path_bytes = NULL;
4180
4181 if (!PyBytes_Check(path) &&
4182 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004183 "path should be string, bytes, or os.PathLike, not %.200s",
4184 Py_TYPE(arg)->tp_name)) {
4185 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004186 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004187 }
4188 path_bytes = PyBytes_FromObject(path);
4189 Py_DECREF(path);
4190 if (!path_bytes) {
4191 return 0;
4192 }
4193 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4194 PyBytes_GET_SIZE(path_bytes));
4195 Py_DECREF(path_bytes);
4196 if (!output) {
4197 return 0;
4198 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004199 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004200 else {
4201 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004202 "path should be string, bytes, or os.PathLike, not %.200s",
4203 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004204 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004205 return 0;
4206 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004207 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004208 Py_DECREF(output);
4209 return 0;
4210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004211 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004212 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004213 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004214 Py_DECREF(output);
4215 return 0;
4216 }
4217 *(PyObject**)addr = output;
4218 return Py_CLEANUP_SUPPORTED;
4219}
4220
4221
Inada Naoki02a4d572020-02-27 13:48:59 +09004222static int unicode_fill_utf8(PyObject *unicode);
4223
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004224const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004225PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004226{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004227 if (!PyUnicode_Check(unicode)) {
4228 PyErr_BadArgument();
4229 return NULL;
4230 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004231 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004232 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004234 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004235 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004236 return NULL;
4237 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 }
4239
4240 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004241 *psize = PyUnicode_UTF8_LENGTH(unicode);
4242 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004243}
4244
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004245const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004248 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4249}
4250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004251Py_UNICODE *
4252PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004254 if (!PyUnicode_Check(unicode)) {
4255 PyErr_BadArgument();
4256 return NULL;
4257 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004258 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4259 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004261 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004262 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263
Serhiy Storchakac46db922018-10-23 22:58:24 +03004264 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4265 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4266 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01004269 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
Serhiy Storchakac46db922018-10-23 22:58:24 +03004270 if (w == NULL) {
4271 PyErr_NoMemory();
4272 return NULL;
4273 }
4274 unicode_copy_as_widechar(unicode, w, wlen + 1);
4275 _PyUnicode_WSTR(unicode) = w;
4276 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4277 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004278 }
4279 }
4280 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004281 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004282 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004283}
4284
Inada Naoki2c4928d2020-06-17 20:09:44 +09004285/* Deprecated APIs */
4286
4287_Py_COMP_DIAG_PUSH
4288_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4289
Alexander Belopolsky40018472011-02-26 01:02:56 +00004290Py_UNICODE *
4291PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004293 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294}
4295
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004296const Py_UNICODE *
4297_PyUnicode_AsUnicode(PyObject *unicode)
4298{
4299 Py_ssize_t size;
4300 const Py_UNICODE *wstr;
4301
4302 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4303 if (wstr && wcslen(wstr) != (size_t)size) {
4304 PyErr_SetString(PyExc_ValueError, "embedded null character");
4305 return NULL;
4306 }
4307 return wstr;
4308}
4309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004310
Alexander Belopolsky40018472011-02-26 01:02:56 +00004311Py_ssize_t
4312PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313{
4314 if (!PyUnicode_Check(unicode)) {
4315 PyErr_BadArgument();
4316 goto onError;
4317 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004318 if (_PyUnicode_WSTR(unicode) == NULL) {
4319 if (PyUnicode_AsUnicode(unicode) == NULL)
4320 goto onError;
4321 }
4322 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323
Benjamin Peterson29060642009-01-31 22:14:21 +00004324 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 return -1;
4326}
4327
Inada Naoki2c4928d2020-06-17 20:09:44 +09004328_Py_COMP_DIAG_POP
4329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004330Py_ssize_t
4331PyUnicode_GetLength(PyObject *unicode)
4332{
Victor Stinner07621332012-06-16 04:53:46 +02004333 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004334 PyErr_BadArgument();
4335 return -1;
4336 }
Victor Stinner07621332012-06-16 04:53:46 +02004337 if (PyUnicode_READY(unicode) == -1)
4338 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 return PyUnicode_GET_LENGTH(unicode);
4340}
4341
4342Py_UCS4
4343PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4344{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004345 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004346 int kind;
4347
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004348 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004349 PyErr_BadArgument();
4350 return (Py_UCS4)-1;
4351 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004352 if (PyUnicode_READY(unicode) == -1) {
4353 return (Py_UCS4)-1;
4354 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004355 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004356 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004357 return (Py_UCS4)-1;
4358 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004359 data = PyUnicode_DATA(unicode);
4360 kind = PyUnicode_KIND(unicode);
4361 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004362}
4363
4364int
4365PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4366{
4367 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004368 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369 return -1;
4370 }
Victor Stinner488fa492011-12-12 00:01:39 +01004371 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004372 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004373 PyErr_SetString(PyExc_IndexError, "string index out of range");
4374 return -1;
4375 }
Victor Stinner488fa492011-12-12 00:01:39 +01004376 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004377 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004378 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4379 PyErr_SetString(PyExc_ValueError, "character out of range");
4380 return -1;
4381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004382 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4383 index, ch);
4384 return 0;
4385}
4386
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *name = "utf-8";
    return name;
}
4392
Victor Stinner554f3f02010-06-16 23:33:54 +00004393/* create or adjust a UnicodeDecodeError */
4394static void
4395make_decode_exception(PyObject **exceptionObject,
4396 const char *encoding,
4397 const char *input, Py_ssize_t length,
4398 Py_ssize_t startpos, Py_ssize_t endpos,
4399 const char *reason)
4400{
4401 if (*exceptionObject == NULL) {
4402 *exceptionObject = PyUnicodeDecodeError_Create(
4403 encoding, input, length, startpos, endpos, reason);
4404 }
4405 else {
4406 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4407 goto onError;
4408 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4409 goto onError;
4410 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4411 goto onError;
4412 }
4413 return;
4414
4415onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004416 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004417}
4418
Steve Dowercc16be82016-09-08 10:35:16 -07004419#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004420static int
4421widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4422{
4423 if (newsize > *size) {
4424 wchar_t *newbuf = *buf;
4425 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4426 PyErr_NoMemory();
4427 return -1;
4428 }
4429 *buf = newbuf;
4430 }
4431 *size = newsize;
4432 return 0;
4433}
4434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435/* error handling callback helper:
4436 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004437 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 and adjust various state variables.
4439 return 0 on success, -1 on error
4440*/
4441
Alexander Belopolsky40018472011-02-26 01:02:56 +00004442static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443unicode_decode_call_errorhandler_wchar(
4444 const char *errors, PyObject **errorHandler,
4445 const char *encoding, const char *reason,
4446 const char **input, const char **inend, Py_ssize_t *startinpos,
4447 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004448 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004450 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451
4452 PyObject *restuple = NULL;
4453 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004454 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004455 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004456 Py_ssize_t requiredsize;
4457 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004458 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004459 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460
4461 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 *errorHandler = PyCodec_LookupError(errors);
4463 if (*errorHandler == NULL)
4464 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 }
4466
Victor Stinner554f3f02010-06-16 23:33:54 +00004467 make_decode_exception(exceptionObject,
4468 encoding,
4469 *input, *inend - *input,
4470 *startinpos, *endinpos,
4471 reason);
4472 if (*exceptionObject == NULL)
4473 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474
Petr Viktorinffd97532020-02-11 17:46:57 +01004475 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004479 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004482 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004484
4485 /* Copy back the bytes variables, which might have been modified by the
4486 callback */
4487 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4488 if (!inputobj)
4489 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004490 *input = PyBytes_AS_STRING(inputobj);
4491 insize = PyBytes_GET_SIZE(inputobj);
4492 *inend = *input + insize;
4493 /* we can DECREF safely, as the exception has another reference,
4494 so the object won't go away. */
4495 Py_DECREF(inputobj);
4496
4497 if (newpos<0)
4498 newpos = insize+newpos;
4499 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004500 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004501 goto onError;
4502 }
4503
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004504#if USE_UNICODE_WCHAR_CACHE
4505_Py_COMP_DIAG_PUSH
4506_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4507 repwlen = PyUnicode_GetSize(repunicode);
4508 if (repwlen < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004509 goto onError;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004510_Py_COMP_DIAG_POP
4511#else /* USE_UNICODE_WCHAR_CACHE */
4512 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4513 if (repwlen < 0)
4514 goto onError;
4515 repwlen--;
4516#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004517 /* need more space? (at least enough for what we
4518 have+the replacement+the rest of the string (starting
4519 at the new input position), so we won't have to check space
4520 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004521 requiredsize = *outpos;
4522 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4523 goto overflow;
4524 requiredsize += repwlen;
4525 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4526 goto overflow;
4527 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004528 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004530 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004531 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004532 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004534 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004535 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004536 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004537 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004538 *endinpos = newpos;
4539 *inptr = *input + newpos;
4540
4541 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004542 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004543 return 0;
4544
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004545 overflow:
4546 PyErr_SetString(PyExc_OverflowError,
4547 "decoded result is too long for a Python string");
4548
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004549 onError:
4550 Py_XDECREF(restuple);
4551 return -1;
4552}
Steve Dowercc16be82016-09-08 10:35:16 -07004553#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554
4555static int
4556unicode_decode_call_errorhandler_writer(
4557 const char *errors, PyObject **errorHandler,
4558 const char *encoding, const char *reason,
4559 const char **input, const char **inend, Py_ssize_t *startinpos,
4560 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4561 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4562{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004563 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004564
4565 PyObject *restuple = NULL;
4566 PyObject *repunicode = NULL;
4567 Py_ssize_t insize;
4568 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004569 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004570 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004571 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004572 int need_to_grow = 0;
4573 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004574
4575 if (*errorHandler == NULL) {
4576 *errorHandler = PyCodec_LookupError(errors);
4577 if (*errorHandler == NULL)
4578 goto onError;
4579 }
4580
4581 make_decode_exception(exceptionObject,
4582 encoding,
4583 *input, *inend - *input,
4584 *startinpos, *endinpos,
4585 reason);
4586 if (*exceptionObject == NULL)
4587 goto onError;
4588
Petr Viktorinffd97532020-02-11 17:46:57 +01004589 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004590 if (restuple == NULL)
4591 goto onError;
4592 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004593 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004594 goto onError;
4595 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004596 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004597 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004598
4599 /* Copy back the bytes variables, which might have been modified by the
4600 callback */
4601 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4602 if (!inputobj)
4603 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004604 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004605 *input = PyBytes_AS_STRING(inputobj);
4606 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004607 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004608 /* we can DECREF safely, as the exception has another reference,
4609 so the object won't go away. */
4610 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004611
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004614 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004615 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004617 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618
Victor Stinner170ca6f2013-04-18 00:25:28 +02004619 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004620 if (replen > 1) {
4621 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004622 need_to_grow = 1;
4623 }
4624 new_inptr = *input + newpos;
4625 if (*inend - new_inptr > remain) {
4626 /* We don't know the decoding algorithm here so we make the worst
4627 assumption that one byte decodes to one unicode character.
4628 If unfortunately one byte could decode to more unicode characters,
4629 the decoder may write out-of-bound then. Is it possible for the
4630 algorithms using this function? */
4631 writer->min_length += *inend - new_inptr - remain;
4632 need_to_grow = 1;
4633 }
4634 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004635 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004636 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004637 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4638 goto onError;
4639 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004640 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004641 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004644 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004647 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004648 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004652 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653}
4654
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655/* --- UTF-7 Codec -------------------------------------------------------- */
4656
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657/* See RFC2152 for details. We encode conservatively and decode liberally. */
4658
4659/* Three simple macros defining base-64. */
4660
4661/* Is c a base-64 character? */
4662
#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z')         \
     || ((c) >= 'a' && (c) <= 'z')      \
     || ((c) >= '0' && (c) <= '9')      \
     || (c) == '+'                      \
     || (c) == '/')

/* Base-64 value of c; c must already be known to be a base-64 character
   (anything not matched by the first four tests is treated as '/'). */

#define FROM_BASE64(c)                              \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A'         \
     : ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26  \
     : ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52  \
     : (c) == '+' ? 62                              \
     : 63)

/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when this byte, found in a UTF-7 string, decodes
 * as itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base64 section. */

#define DECODE_DIRECT(c)  \
    ((c) <= 127 && (c) != '+')
4689
4690/* The UTF-7 encoder treats ASCII characters differently according to
4691 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4692 * the above). See RFC2152. This array identifies these different
4693 * sets:
4694 * 0 : "Set D"
4695 * alphanumeric and '(),-./:?
4696 * 1 : "Set O"
4697 * !"#$%&*;<=>@[]^_`{|}
4698 * 2 : "whitespace"
4699 * ht nl cr sp
4700 * 3 : special (must be base64 encoded)
4701 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4702 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004703
/* Category of each ASCII code point for the UTF-7 encoder:
 * 0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base64
 * encoded).  The table is only ever read, so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : code point to classify; only 1..127 can be direct.
 * directO  : non-zero to pass Set O characters through unencoded.
 * directWS : non-zero to pass whitespace through unencoded.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) keep their meaning next to `&&`.  `c` is
 * evaluated more than once: do not pass an expression with side
 * effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735
Alexander Belopolsky40018472011-02-26 01:02:56 +00004736PyObject *
4737PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004738 Py_ssize_t size,
4739 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004741 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4742}
4743
Antoine Pitrou244651a2009-05-04 18:56:13 +00004744/* The decoder. The only state we preserve is our read position,
4745 * i.e. how many characters we have consumed. So if we end in the
4746 * middle of a shift sequence we have to back off the read position
4747 * and the output to the beginning of the sequence, otherwise we lose
4748 * all the shift state (seen bits, number of bits seen, high
4749 * surrogate). */
4750
Alexander Belopolsky40018472011-02-26 01:02:56 +00004751PyObject *
4752PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004753 Py_ssize_t size,
4754 const char *errors,
4755 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004756{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004758 Py_ssize_t startinpos;
4759 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004760 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004761 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004762 const char *errmsg = "";
4763 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004764 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 unsigned int base64bits = 0;
4766 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004767 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 PyObject *errorHandler = NULL;
4769 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004771 if (size == 0) {
4772 if (consumed)
4773 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004774 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004775 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004776
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004777 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004778 _PyUnicodeWriter_Init(&writer);
4779 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004780
4781 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004782 e = s + size;
4783
4784 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004785 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004787 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788
Antoine Pitrou244651a2009-05-04 18:56:13 +00004789 if (inShift) { /* in a base-64 section */
4790 if (IS_BASE64(ch)) { /* consume a base-64 character */
4791 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4792 base64bits += 6;
4793 s++;
4794 if (base64bits >= 16) {
4795 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004796 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 base64bits -= 16;
4798 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004799 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 if (surrogate) {
4801 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004802 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4803 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004804 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004805 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004806 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004807 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 }
4809 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004810 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004811 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004812 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 }
4814 }
Victor Stinner551ac952011-11-29 22:58:13 +01004815 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 /* first surrogate */
4817 surrogate = outCh;
4818 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004819 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004820 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004821 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 }
4823 }
4824 }
4825 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004826 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 if (base64bits > 0) { /* left-over bits */
4828 if (base64bits >= 6) {
4829 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004830 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004831 errmsg = "partial character in shift sequence";
4832 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004833 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004834 else {
4835 /* Some bits remain; they should be zero */
4836 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004837 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004838 errmsg = "non-zero padding bits in shift sequence";
4839 goto utf7Error;
4840 }
4841 }
4842 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004843 if (surrogate && DECODE_DIRECT(ch)) {
4844 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4845 goto onError;
4846 }
4847 surrogate = 0;
4848 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004849 /* '-' is absorbed; other terminating
4850 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004851 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004852 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004853 }
4854 }
4855 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004857 s++; /* consume '+' */
4858 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004859 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004860 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004861 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004862 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004863 else if (s < e && !IS_BASE64(*s)) {
4864 s++;
4865 errmsg = "ill-formed sequence";
4866 goto utf7Error;
4867 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004868 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004869 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004870 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004871 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004872 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004873 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004874 }
4875 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004876 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004877 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004878 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004879 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004880 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004881 else {
4882 startinpos = s-starts;
4883 s++;
4884 errmsg = "unexpected special character";
4885 goto utf7Error;
4886 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004887 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004888utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004890 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 errors, &errorHandler,
4892 "utf7", errmsg,
4893 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004894 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004895 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004896 }
4897
Antoine Pitrou244651a2009-05-04 18:56:13 +00004898 /* end of string */
4899
4900 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4901 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004902 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004903 if (surrogate ||
4904 (base64bits >= 6) ||
4905 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004906 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004907 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004908 errors, &errorHandler,
4909 "utf7", "unterminated shift sequence",
4910 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004911 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004912 goto onError;
4913 if (s < e)
4914 goto restart;
4915 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004916 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004917
4918 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004919 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004920 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004921 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004922 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004923 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004924 writer.kind, writer.data, shiftOutStart);
4925 Py_XDECREF(errorHandler);
4926 Py_XDECREF(exc);
4927 _PyUnicodeWriter_Dealloc(&writer);
4928 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004929 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004930 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004931 }
4932 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004933 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004934 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004935 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004936
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937 Py_XDECREF(errorHandler);
4938 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004939 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004940
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004942 Py_XDECREF(errorHandler);
4943 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004944 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004945 return NULL;
4946}
4947
4948
Alexander Belopolsky40018472011-02-26 01:02:56 +00004949PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004950_PyUnicode_EncodeUTF7(PyObject *str,
4951 int base64SetO,
4952 int base64WhiteSpace,
4953 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004954{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004955 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004956 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004957 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004958 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004959 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004960 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004961 unsigned int base64bits = 0;
4962 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004963 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004964 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004965
Benjamin Petersonbac79492012-01-14 13:34:47 -05004966 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004967 return NULL;
4968 kind = PyUnicode_KIND(str);
4969 data = PyUnicode_DATA(str);
4970 len = PyUnicode_GET_LENGTH(str);
4971
4972 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004974
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004975 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004976 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004977 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004978 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004979 if (v == NULL)
4980 return NULL;
4981
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004982 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004983 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004984 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004985
Antoine Pitrou244651a2009-05-04 18:56:13 +00004986 if (inShift) {
4987 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4988 /* shifting out */
4989 if (base64bits) { /* output remaining bits */
4990 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4991 base64buffer = 0;
4992 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004993 }
4994 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004995 /* Characters not in the BASE64 set implicitly unshift the sequence
4996 so no '-' is required, except if the character is itself a '-' */
4997 if (IS_BASE64(ch) || ch == '-') {
4998 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004999 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005000 *out++ = (char) ch;
5001 }
5002 else {
5003 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00005004 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005005 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005006 else { /* not in a shift sequence */
5007 if (ch == '+') {
5008 *out++ = '+';
5009 *out++ = '-';
5010 }
5011 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
5012 *out++ = (char) ch;
5013 }
5014 else {
5015 *out++ = '+';
5016 inShift = 1;
5017 goto encode_char;
5018 }
5019 }
5020 continue;
5021encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00005022 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005023 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01005024
Antoine Pitrou244651a2009-05-04 18:56:13 +00005025 /* code first surrogate */
5026 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01005027 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00005028 while (base64bits >= 6) {
5029 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
5030 base64bits -= 6;
5031 }
5032 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01005033 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00005034 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005035 base64bits += 16;
5036 base64buffer = (base64buffer << 16) | ch;
5037 while (base64bits >= 6) {
5038 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
5039 base64bits -= 6;
5040 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00005041 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005042 if (base64bits)
5043 *out++= TO_BASE64(base64buffer << (6-base64bits) );
5044 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005045 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005046 if (_PyBytes_Resize(&v, out - start) < 0)
5047 return NULL;
5048 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005049}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005050PyObject *
5051PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5052 Py_ssize_t size,
5053 int base64SetO,
5054 int base64WhiteSpace,
5055 const char *errors)
5056{
5057 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005058 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005059 if (tmp == NULL)
5060 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01005061 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005062 base64WhiteSpace, errors);
5063 Py_DECREF(tmp);
5064 return result;
5065}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005066
Antoine Pitrou244651a2009-05-04 18:56:13 +00005067#undef IS_BASE64
5068#undef FROM_BASE64
5069#undef TO_BASE64
5070#undef DECODE_DIRECT
5071#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005072
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073/* --- UTF-8 Codec -------------------------------------------------------- */
5074
Alexander Belopolsky40018472011-02-26 01:02:56 +00005075PyObject *
5076PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005077 Py_ssize_t size,
5078 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079{
Walter Dörwald69652032004-09-07 20:24:22 +00005080 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5081}
5082
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083#include "stringlib/asciilib.h"
5084#include "stringlib/codecs.h"
5085#include "stringlib/undef.h"
5086
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005087#include "stringlib/ucs1lib.h"
5088#include "stringlib/codecs.h"
5089#include "stringlib/undef.h"
5090
5091#include "stringlib/ucs2lib.h"
5092#include "stringlib/codecs.h"
5093#include "stringlib/undef.h"
5094
5095#include "stringlib/ucs4lib.h"
5096#include "stringlib/codecs.h"
5097#include "stringlib/undef.h"
5098
Ma Lina0c603c2020-10-18 22:48:38 +08005099/* Mask to quickly check whether a C 'size_t' contains a
Antoine Pitrouab868312009-01-10 15:40:25 +00005100 non-ASCII, UTF8-encoded char. */
Ma Lina0c603c2020-10-18 22:48:38 +08005101#if (SIZEOF_SIZE_T == 8)
5102# define ASCII_CHAR_MASK 0x8080808080808080ULL
5103#elif (SIZEOF_SIZE_T == 4)
5104# define ASCII_CHAR_MASK 0x80808080U
Antoine Pitrouab868312009-01-10 15:40:25 +00005105#else
Ma Lina0c603c2020-10-18 22:48:38 +08005106# error C 'size_t' size should be either 4 or 8!
Antoine Pitrouab868312009-01-10 15:40:25 +00005107#endif
5108
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005109static Py_ssize_t
5110ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005111{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005112 const char *p = start;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113
Ma Lina0c603c2020-10-18 22:48:38 +08005114#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
Jessica Clarkedec07572021-03-31 11:12:39 +01005115 assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
5116 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005117 /* Fast path, see in STRINGLIB(utf8_decode) for
5118 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005119 /* Help allocation */
5120 const char *_p = p;
5121 Py_UCS1 * q = dest;
Jessica Clarkedec07572021-03-31 11:12:39 +01005122 while (_p + SIZEOF_SIZE_T <= end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005123 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005126 *((size_t *)q) = value;
5127 _p += SIZEOF_SIZE_T;
5128 q += SIZEOF_SIZE_T;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005129 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 p = _p;
5131 while (p < end) {
5132 if ((unsigned char)*p & 0x80)
5133 break;
5134 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005136 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005138#endif
5139 while (p < end) {
5140 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5141 for an explanation. */
Jessica Clarkedec07572021-03-31 11:12:39 +01005142 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005143 /* Help allocation */
5144 const char *_p = p;
Jessica Clarkedec07572021-03-31 11:12:39 +01005145 while (_p + SIZEOF_SIZE_T <= end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005146 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 if (value & ASCII_CHAR_MASK)
5148 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005149 _p += SIZEOF_SIZE_T;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005150 }
5151 p = _p;
5152 if (_p == end)
5153 break;
5154 }
5155 if ((unsigned char)*p & 0x80)
5156 break;
5157 ++p;
5158 }
5159 memcpy(dest, start, p - start);
5160 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161}
Antoine Pitrouab868312009-01-10 15:40:25 +00005162
Victor Stinner709d23d2019-05-02 14:56:30 -04005163static PyObject *
5164unicode_decode_utf8(const char *s, Py_ssize_t size,
5165 _Py_error_handler error_handler, const char *errors,
5166 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005167{
Victor Stinner785938e2011-12-11 20:09:03 +01005168 if (size == 0) {
5169 if (consumed)
5170 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005171 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005172 }
5173
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005174 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5175 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005176 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005177 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005178 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005179 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005180 }
5181
Inada Naoki770847a2019-06-24 12:30:24 +09005182 const char *starts = s;
5183 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005184
Inada Naoki770847a2019-06-24 12:30:24 +09005185 // fast path: try ASCII string.
5186 PyObject *u = PyUnicode_New(size, 127);
5187 if (u == NULL) {
5188 return NULL;
5189 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005190 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005191 if (s == end) {
5192 return u;
5193 }
5194
5195 // Use _PyUnicodeWriter after fast path is failed.
5196 _PyUnicodeWriter writer;
5197 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5198 writer.pos = s - starts;
5199
5200 Py_ssize_t startinpos, endinpos;
5201 const char *errmsg = "";
5202 PyObject *error_handler_obj = NULL;
5203 PyObject *exc = NULL;
5204
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005205 while (s < end) {
5206 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005207 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005208
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005209 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005210 if (PyUnicode_IS_ASCII(writer.buffer))
5211 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005212 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005213 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005214 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005215 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005216 } else {
5217 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005218 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005219 }
5220
5221 switch (ch) {
5222 case 0:
5223 if (s == end || consumed)
5224 goto End;
5225 errmsg = "unexpected end of data";
5226 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005227 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005228 break;
5229 case 1:
5230 errmsg = "invalid start byte";
5231 startinpos = s - starts;
5232 endinpos = startinpos + 1;
5233 break;
5234 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005235 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5236 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5237 {
5238 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005239 goto End;
5240 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005241 /* fall through */
5242 case 3:
5243 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005244 errmsg = "invalid continuation byte";
5245 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005246 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005247 break;
5248 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005249 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005250 goto onError;
5251 continue;
5252 }
5253
Victor Stinner1d65d912015-10-05 13:43:50 +02005254 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005255 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005256
5257 switch (error_handler) {
5258 case _Py_ERROR_IGNORE:
5259 s += (endinpos - startinpos);
5260 break;
5261
5262 case _Py_ERROR_REPLACE:
5263 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5264 goto onError;
5265 s += (endinpos - startinpos);
5266 break;
5267
5268 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005269 {
5270 Py_ssize_t i;
5271
Victor Stinner1d65d912015-10-05 13:43:50 +02005272 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5273 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005274 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005275 ch = (Py_UCS4)(unsigned char)(starts[i]);
5276 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5277 ch + 0xdc00);
5278 writer.pos++;
5279 }
5280 s += (endinpos - startinpos);
5281 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005282 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005283
5284 default:
5285 if (unicode_decode_call_errorhandler_writer(
5286 errors, &error_handler_obj,
5287 "utf-8", errmsg,
5288 &starts, &end, &startinpos, &endinpos, &exc, &s,
5289 &writer))
5290 goto onError;
5291 }
Victor Stinner785938e2011-12-11 20:09:03 +01005292 }
5293
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005294End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005295 if (consumed)
5296 *consumed = s - starts;
5297
Victor Stinner1d65d912015-10-05 13:43:50 +02005298 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005299 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005300 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005301
5302onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005303 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005304 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005305 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005306 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005307}
5308
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005309
Victor Stinner709d23d2019-05-02 14:56:30 -04005310PyObject *
5311PyUnicode_DecodeUTF8Stateful(const char *s,
5312 Py_ssize_t size,
5313 const char *errors,
5314 Py_ssize_t *consumed)
5315{
5316 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5317}
5318
5319
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005320/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5321 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005322
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005323 On success, write a pointer to a newly allocated wide character string into
5324 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5325 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005326
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005327 On memory allocation failure, return -1.
5328
5329 On decoding error (if surrogateescape is zero), return -2. If wlen is
5330 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5331 is not NULL, write the decoding error message into *reason. */
5332int
5333_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005334 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005335{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005336 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005337 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005338 wchar_t *unicode;
5339 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005340
Victor Stinner3d4226a2018-08-29 22:21:32 +02005341 int surrogateescape = 0;
5342 int surrogatepass = 0;
5343 switch (errors)
5344 {
5345 case _Py_ERROR_STRICT:
5346 break;
5347 case _Py_ERROR_SURROGATEESCAPE:
5348 surrogateescape = 1;
5349 break;
5350 case _Py_ERROR_SURROGATEPASS:
5351 surrogatepass = 1;
5352 break;
5353 default:
5354 return -3;
5355 }
5356
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005357 /* Note: size will always be longer than the resulting Unicode
5358 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005359 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005360 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005361 }
5362
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005363 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005364 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005365 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005366 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005367
5368 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005369 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005370 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005371 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005372 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005373#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005374 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005375#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005376 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005377#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005378 if (ch > 0xFF) {
5379#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005380 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005381#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005382 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005383 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005384 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5385 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5386#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005387 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005388 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005389 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005390 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005391 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005392
5393 if (surrogateescape) {
5394 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5395 }
5396 else {
5397 /* Is it a valid three-byte code? */
5398 if (surrogatepass
5399 && (e - s) >= 3
5400 && (s[0] & 0xf0) == 0xe0
5401 && (s[1] & 0xc0) == 0x80
5402 && (s[2] & 0xc0) == 0x80)
5403 {
5404 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5405 s += 3;
5406 unicode[outpos++] = ch;
5407 }
5408 else {
5409 PyMem_RawFree(unicode );
5410 if (reason != NULL) {
5411 switch (ch) {
5412 case 0:
5413 *reason = "unexpected end of data";
5414 break;
5415 case 1:
5416 *reason = "invalid start byte";
5417 break;
5418 /* 2, 3, 4 */
5419 default:
5420 *reason = "invalid continuation byte";
5421 break;
5422 }
5423 }
5424 if (wlen != NULL) {
5425 *wlen = s - orig_s;
5426 }
5427 return -2;
5428 }
5429 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005430 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005431 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005432 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005433 if (wlen) {
5434 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005435 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005436 *wstr = unicode;
5437 return 0;
5438}
5439
Victor Stinner5f9cf232019-03-19 01:46:25 +01005440
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005441wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005442_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5443 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005444{
5445 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005446 int res = _Py_DecodeUTF8Ex(arg, arglen,
5447 &wstr, wlen,
5448 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005449 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005450 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5451 assert(res != -3);
5452 if (wlen) {
5453 *wlen = (size_t)res;
5454 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005455 return NULL;
5456 }
5457 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005458}
5459
Antoine Pitrouab868312009-01-10 15:40:25 +00005460
/* UTF-8 encoder.  'errors' selects the strict, surrogateescape or
   surrogatepass error handler; for any other error handler the function
   returns -3.

   On success, return 0 and write the newly allocated character string into
   *str (the memory comes from PyMem_RawMalloc() if raw_malloc is non-zero
   and from PyMem_Malloc() otherwise, and must be freed accordingly).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1. */
5471int
5472_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005473 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005474{
5475 const Py_ssize_t max_char_size = 4;
5476 Py_ssize_t len = wcslen(text);
5477
5478 assert(len >= 0);
5479
Victor Stinner3d4226a2018-08-29 22:21:32 +02005480 int surrogateescape = 0;
5481 int surrogatepass = 0;
5482 switch (errors)
5483 {
5484 case _Py_ERROR_STRICT:
5485 break;
5486 case _Py_ERROR_SURROGATEESCAPE:
5487 surrogateescape = 1;
5488 break;
5489 case _Py_ERROR_SURROGATEPASS:
5490 surrogatepass = 1;
5491 break;
5492 default:
5493 return -3;
5494 }
5495
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005496 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5497 return -1;
5498 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005499 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005500 if (raw_malloc) {
5501 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005502 }
5503 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005504 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005505 }
5506 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005507 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005508 }
5509
5510 char *p = bytes;
5511 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005512 for (i = 0; i < len; ) {
5513 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005514 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005515 i++;
5516#if Py_UNICODE_SIZE == 2
5517 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5518 && i < len
5519 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5520 {
5521 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5522 i++;
5523 }
5524#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005525
5526 if (ch < 0x80) {
5527 /* Encode ASCII */
5528 *p++ = (char) ch;
5529
5530 }
5531 else if (ch < 0x0800) {
5532 /* Encode Latin-1 */
5533 *p++ = (char)(0xc0 | (ch >> 6));
5534 *p++ = (char)(0x80 | (ch & 0x3f));
5535 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005536 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005537 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005538 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005539 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005540 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005541 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005542 if (reason != NULL) {
5543 *reason = "encoding error";
5544 }
5545 if (raw_malloc) {
5546 PyMem_RawFree(bytes);
5547 }
5548 else {
5549 PyMem_Free(bytes);
5550 }
5551 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005552 }
5553 *p++ = (char)(ch & 0xff);
5554 }
5555 else if (ch < 0x10000) {
5556 *p++ = (char)(0xe0 | (ch >> 12));
5557 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5558 *p++ = (char)(0x80 | (ch & 0x3f));
5559 }
5560 else { /* ch >= 0x10000 */
5561 assert(ch <= MAX_UNICODE);
5562 /* Encode UCS4 Unicode ordinals */
5563 *p++ = (char)(0xf0 | (ch >> 18));
5564 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5565 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5566 *p++ = (char)(0x80 | (ch & 0x3f));
5567 }
5568 }
5569 *p++ = '\0';
5570
5571 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005572 char *bytes2;
5573 if (raw_malloc) {
5574 bytes2 = PyMem_RawRealloc(bytes, final_size);
5575 }
5576 else {
5577 bytes2 = PyMem_Realloc(bytes, final_size);
5578 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005579 if (bytes2 == NULL) {
5580 if (error_pos != NULL) {
5581 *error_pos = (size_t)-1;
5582 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005583 if (raw_malloc) {
5584 PyMem_RawFree(bytes);
5585 }
5586 else {
5587 PyMem_Free(bytes);
5588 }
5589 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005590 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005591 *str = bytes2;
5592 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005593}
5594
5595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005596/* Primary internal function which creates utf8 encoded bytes objects.
5597
5598 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005599 and allocate exactly as much space needed at the end. Else allocate the
5600 maximum possible needed (4 result bytes per Unicode character), and return
5601 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005602*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005603static PyObject *
5604unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5605 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005607 if (!PyUnicode_Check(unicode)) {
5608 PyErr_BadArgument();
5609 return NULL;
5610 }
5611
5612 if (PyUnicode_READY(unicode) == -1)
5613 return NULL;
5614
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005615 if (PyUnicode_UTF8(unicode))
5616 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5617 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005618
Inada Naoki02a4d572020-02-27 13:48:59 +09005619 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005620 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005621 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5622
5623 _PyBytesWriter writer;
5624 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005625
Benjamin Petersonead6b532011-12-20 17:23:42 -06005626 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005627 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005628 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005629 case PyUnicode_1BYTE_KIND:
5630 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5631 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005632 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5633 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005634 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005635 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5636 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005637 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005638 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5639 break;
Tim Peters602f7402002-04-27 18:03:26 +00005640 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005641
5642 if (end == NULL) {
5643 _PyBytesWriter_Dealloc(&writer);
5644 return NULL;
5645 }
5646 return _PyBytesWriter_Finish(&writer, end);
5647}
5648
5649static int
5650unicode_fill_utf8(PyObject *unicode)
5651{
5652 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5653 assert(!PyUnicode_IS_ASCII(unicode));
5654
5655 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005656 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005657 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5658
5659 _PyBytesWriter writer;
5660 char *end;
5661
5662 switch (kind) {
5663 default:
5664 Py_UNREACHABLE();
5665 case PyUnicode_1BYTE_KIND:
5666 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5667 _Py_ERROR_STRICT, NULL);
5668 break;
5669 case PyUnicode_2BYTE_KIND:
5670 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5671 _Py_ERROR_STRICT, NULL);
5672 break;
5673 case PyUnicode_4BYTE_KIND:
5674 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5675 _Py_ERROR_STRICT, NULL);
5676 break;
5677 }
5678 if (end == NULL) {
5679 _PyBytesWriter_Dealloc(&writer);
5680 return -1;
5681 }
5682
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005683 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005684 PyBytes_AS_STRING(writer.buffer);
5685 Py_ssize_t len = end - start;
5686
Victor Stinner32bd68c2020-12-01 10:37:39 +01005687 char *cache = PyObject_Malloc(len + 1);
Inada Naoki02a4d572020-02-27 13:48:59 +09005688 if (cache == NULL) {
5689 _PyBytesWriter_Dealloc(&writer);
5690 PyErr_NoMemory();
5691 return -1;
5692 }
5693 _PyUnicode_UTF8(unicode) = cache;
5694 _PyUnicode_UTF8_LENGTH(unicode) = len;
5695 memcpy(cache, start, len);
5696 cache[len] = '\0';
5697 _PyBytesWriter_Dealloc(&writer);
5698 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699}
5700
Alexander Belopolsky40018472011-02-26 01:02:56 +00005701PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005702_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5703{
5704 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5705}
5706
5707
5708PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5710 Py_ssize_t size,
5711 const char *errors)
5712{
5713 PyObject *v, *unicode;
5714
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005715 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005716 if (unicode == NULL)
5717 return NULL;
5718 v = _PyUnicode_AsUTF8String(unicode, errors);
5719 Py_DECREF(unicode);
5720 return v;
5721}
5722
5723PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005724PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005726 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727}
5728
Walter Dörwald41980ca2007-08-16 21:55:45 +00005729/* --- UTF-32 Codec ------------------------------------------------------- */
5730
5731PyObject *
5732PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 Py_ssize_t size,
5734 const char *errors,
5735 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005736{
5737 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5738}
5739
5740PyObject *
5741PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 Py_ssize_t size,
5743 const char *errors,
5744 int *byteorder,
5745 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005746{
5747 const char *starts = s;
5748 Py_ssize_t startinpos;
5749 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005750 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005751 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005752 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005754 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005755 PyObject *errorHandler = NULL;
5756 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005757
Andy Lestere6be9b52020-02-11 20:28:35 -06005758 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005759 e = q + size;
5760
5761 if (byteorder)
5762 bo = *byteorder;
5763
5764 /* Check for BOM marks (U+FEFF) in the input and adjust current
5765 byte order setting accordingly. In native mode, the leading BOM
5766 mark is skipped, in all other modes, it is copied to the output
5767 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005768 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005769 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005770 if (bom == 0x0000FEFF) {
5771 bo = -1;
5772 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005774 else if (bom == 0xFFFE0000) {
5775 bo = 1;
5776 q += 4;
5777 }
5778 if (byteorder)
5779 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005780 }
5781
Victor Stinnere64322e2012-10-30 23:12:47 +01005782 if (q == e) {
5783 if (consumed)
5784 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005785 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005786 }
5787
Victor Stinnere64322e2012-10-30 23:12:47 +01005788#ifdef WORDS_BIGENDIAN
5789 le = bo < 0;
5790#else
5791 le = bo <= 0;
5792#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005793 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005794
Victor Stinner8f674cc2013-04-17 23:02:17 +02005795 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005796 writer.min_length = (e - q + 3) / 4;
5797 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005798 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005799
Victor Stinnere64322e2012-10-30 23:12:47 +01005800 while (1) {
5801 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005803
Victor Stinnere64322e2012-10-30 23:12:47 +01005804 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005805 enum PyUnicode_Kind kind = writer.kind;
5806 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005807 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005808 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005809 if (le) {
5810 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005811 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005812 if (ch > maxch)
5813 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005814 if (kind != PyUnicode_1BYTE_KIND &&
5815 Py_UNICODE_IS_SURROGATE(ch))
5816 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005817 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005818 q += 4;
5819 } while (q <= last);
5820 }
5821 else {
5822 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005823 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005824 if (ch > maxch)
5825 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005826 if (kind != PyUnicode_1BYTE_KIND &&
5827 Py_UNICODE_IS_SURROGATE(ch))
5828 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005829 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005830 q += 4;
5831 } while (q <= last);
5832 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005833 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005834 }
5835
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005837 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005838 startinpos = ((const char *)q) - starts;
5839 endinpos = startinpos + 4;
5840 }
5841 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005842 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005844 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005846 startinpos = ((const char *)q) - starts;
5847 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005849 else {
5850 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005851 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005852 goto onError;
5853 q += 4;
5854 continue;
5855 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005856 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005857 startinpos = ((const char *)q) - starts;
5858 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005860
5861 /* The remaining input chars are ignored if the callback
5862 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005863 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005865 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005867 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005869 }
5870
Walter Dörwald41980ca2007-08-16 21:55:45 +00005871 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005873
Walter Dörwald41980ca2007-08-16 21:55:45 +00005874 Py_XDECREF(errorHandler);
5875 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005876 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005879 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005880 Py_XDECREF(errorHandler);
5881 Py_XDECREF(exc);
5882 return NULL;
5883}
5884
5885PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005886_PyUnicode_EncodeUTF32(PyObject *str,
5887 const char *errors,
5888 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005889{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005890 enum PyUnicode_Kind kind;
5891 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005893 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005894 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005895#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005896 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005897#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005898 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005899#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005900 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005901 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005902 PyObject *errorHandler = NULL;
5903 PyObject *exc = NULL;
5904 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005905
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005906 if (!PyUnicode_Check(str)) {
5907 PyErr_BadArgument();
5908 return NULL;
5909 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005910 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 return NULL;
5912 kind = PyUnicode_KIND(str);
5913 data = PyUnicode_DATA(str);
5914 len = PyUnicode_GET_LENGTH(str);
5915
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005916 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005917 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005918 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005919 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005920 if (v == NULL)
5921 return NULL;
5922
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005923 /* output buffer is 4-bytes aligned */
5924 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005925 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005926 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005927 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005929 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005930
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005931 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005932 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005933 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005934 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005935 else
5936 encoding = "utf-32";
5937
5938 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005939 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5940 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005941 }
5942
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005943 pos = 0;
5944 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005945 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005946
5947 if (kind == PyUnicode_2BYTE_KIND) {
5948 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5949 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005950 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005951 else {
5952 assert(kind == PyUnicode_4BYTE_KIND);
5953 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5954 &out, native_ordering);
5955 }
5956 if (pos == len)
5957 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005958
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005959 rep = unicode_encode_call_errorhandler(
5960 errors, &errorHandler,
5961 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005962 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005963 if (!rep)
5964 goto error;
5965
5966 if (PyBytes_Check(rep)) {
5967 repsize = PyBytes_GET_SIZE(rep);
5968 if (repsize & 3) {
5969 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005970 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005971 "surrogates not allowed");
5972 goto error;
5973 }
5974 moreunits = repsize / 4;
5975 }
5976 else {
5977 assert(PyUnicode_Check(rep));
5978 if (PyUnicode_READY(rep) < 0)
5979 goto error;
5980 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5981 if (!PyUnicode_IS_ASCII(rep)) {
5982 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005983 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005984 "surrogates not allowed");
5985 goto error;
5986 }
5987 }
5988
5989 /* four bytes are reserved for each surrogate */
5990 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005991 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005992 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005993 /* integer overflow */
5994 PyErr_NoMemory();
5995 goto error;
5996 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005997 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005998 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005999 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006000 }
6001
6002 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006003 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006004 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006005 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006006 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006007 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6008 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006009 }
6010
6011 Py_CLEAR(rep);
6012 }
6013
6014 /* Cut back to size actually needed. This is necessary for, for example,
6015 encoding of a string containing isolated surrogates and the 'ignore'
6016 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006017 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006018 if (nsize != PyBytes_GET_SIZE(v))
6019 _PyBytes_Resize(&v, nsize);
6020 Py_XDECREF(errorHandler);
6021 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006022 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006023 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006024 error:
6025 Py_XDECREF(rep);
6026 Py_XDECREF(errorHandler);
6027 Py_XDECREF(exc);
6028 Py_XDECREF(v);
6029 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00006030}
6031
Alexander Belopolsky40018472011-02-26 01:02:56 +00006032PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6034 Py_ssize_t size,
6035 const char *errors,
6036 int byteorder)
6037{
6038 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006039 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040 if (tmp == NULL)
6041 return NULL;
6042 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6043 Py_DECREF(tmp);
6044 return result;
6045}
6046
6047PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006048PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00006049{
Victor Stinnerb960b342011-11-20 19:12:52 +01006050 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00006051}
6052
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053/* --- UTF-16 Codec ------------------------------------------------------- */
6054
Tim Peters772747b2001-08-09 22:21:55 +00006055PyObject *
6056PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 Py_ssize_t size,
6058 const char *errors,
6059 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060{
Walter Dörwald69652032004-09-07 20:24:22 +00006061 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6062}
6063
6064PyObject *
6065PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 Py_ssize_t size,
6067 const char *errors,
6068 int *byteorder,
6069 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00006070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006072 Py_ssize_t startinpos;
6073 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006074 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006075 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006076 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006077 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006078 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 PyObject *errorHandler = NULL;
6080 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006081 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082
Andy Lestere6be9b52020-02-11 20:28:35 -06006083 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006084 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085
6086 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006087 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006089 /* Check for BOM marks (U+FEFF) in the input and adjust current
6090 byte order setting accordingly. In native mode, the leading BOM
6091 mark is skipped, in all other modes, it is copied to the output
6092 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006093 if (bo == 0 && size >= 2) {
6094 const Py_UCS4 bom = (q[1] << 8) | q[0];
6095 if (bom == 0xFEFF) {
6096 q += 2;
6097 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006099 else if (bom == 0xFFFE) {
6100 q += 2;
6101 bo = 1;
6102 }
6103 if (byteorder)
6104 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106
Antoine Pitrou63065d72012-05-15 23:48:04 +02006107 if (q == e) {
6108 if (consumed)
6109 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006110 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006111 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006112
Christian Heimes743e0cd2012-10-17 23:52:17 +02006113#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006114 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006115 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006116#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006117 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006118 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006119#endif
Tim Peters772747b2001-08-09 22:21:55 +00006120
Antoine Pitrou63065d72012-05-15 23:48:04 +02006121 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006122 character count normally. Error handler will take care of
6123 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006124 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006125 writer.min_length = (e - q + 1) / 2;
6126 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006127 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006128
Antoine Pitrou63065d72012-05-15 23:48:04 +02006129 while (1) {
6130 Py_UCS4 ch = 0;
6131 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006132 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006133 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006134 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006135 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006136 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006137 native_ordering);
6138 else
6139 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006140 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006141 native_ordering);
6142 } else if (kind == PyUnicode_2BYTE_KIND) {
6143 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006144 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006145 native_ordering);
6146 } else {
6147 assert(kind == PyUnicode_4BYTE_KIND);
6148 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006149 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006150 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006151 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006152 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153
Antoine Pitrou63065d72012-05-15 23:48:04 +02006154 switch (ch)
6155 {
6156 case 0:
6157 /* remaining byte at the end? (size should be even) */
6158 if (q == e || consumed)
6159 goto End;
6160 errmsg = "truncated data";
6161 startinpos = ((const char *)q) - starts;
6162 endinpos = ((const char *)e) - starts;
6163 break;
6164 /* The remaining input chars are ignored if the callback
6165 chooses to skip the input */
6166 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006167 q -= 2;
6168 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006169 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006170 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006171 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006172 endinpos = ((const char *)e) - starts;
6173 break;
6174 case 2:
6175 errmsg = "illegal encoding";
6176 startinpos = ((const char *)q) - 2 - starts;
6177 endinpos = startinpos + 2;
6178 break;
6179 case 3:
6180 errmsg = "illegal UTF-16 surrogate";
6181 startinpos = ((const char *)q) - 4 - starts;
6182 endinpos = startinpos + 2;
6183 break;
6184 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006185 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006186 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 continue;
6188 }
6189
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006190 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006191 errors,
6192 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006193 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006194 &starts,
6195 (const char **)&e,
6196 &startinpos,
6197 &endinpos,
6198 &exc,
6199 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006200 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 }
6203
Antoine Pitrou63065d72012-05-15 23:48:04 +02006204End:
Walter Dörwald69652032004-09-07 20:24:22 +00006205 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 Py_XDECREF(errorHandler);
6209 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006210 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006213 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 Py_XDECREF(errorHandler);
6215 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 return NULL;
6217}
6218
Tim Peters772747b2001-08-09 22:21:55 +00006219PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006220_PyUnicode_EncodeUTF16(PyObject *str,
6221 const char *errors,
6222 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006224 enum PyUnicode_Kind kind;
6225 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006227 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006228 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006229 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006230#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006231 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006232#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006233 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006234#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006235 const char *encoding;
6236 Py_ssize_t nsize, pos;
6237 PyObject *errorHandler = NULL;
6238 PyObject *exc = NULL;
6239 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006240
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006241 if (!PyUnicode_Check(str)) {
6242 PyErr_BadArgument();
6243 return NULL;
6244 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006245 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006246 return NULL;
6247 kind = PyUnicode_KIND(str);
6248 data = PyUnicode_DATA(str);
6249 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006250
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006251 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006252 if (kind == PyUnicode_4BYTE_KIND) {
6253 const Py_UCS4 *in = (const Py_UCS4 *)data;
6254 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006255 while (in < end) {
6256 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006257 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006258 }
6259 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006260 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006261 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006263 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006264 nsize = len + pairs + (byteorder == 0);
6265 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006266 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006270 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006271 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006272 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006273 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006274 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006275 }
6276 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006277 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006278 }
Tim Peters772747b2001-08-09 22:21:55 +00006279
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006280 if (kind == PyUnicode_1BYTE_KIND) {
6281 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6282 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006283 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006284
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006285 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006286 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006287 }
6288 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006289 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006290 }
6291 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006292 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006293 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006294
6295 pos = 0;
6296 while (pos < len) {
6297 Py_ssize_t repsize, moreunits;
6298
6299 if (kind == PyUnicode_2BYTE_KIND) {
6300 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6301 &out, native_ordering);
6302 }
6303 else {
6304 assert(kind == PyUnicode_4BYTE_KIND);
6305 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6306 &out, native_ordering);
6307 }
6308 if (pos == len)
6309 break;
6310
6311 rep = unicode_encode_call_errorhandler(
6312 errors, &errorHandler,
6313 encoding, "surrogates not allowed",
6314 str, &exc, pos, pos + 1, &pos);
6315 if (!rep)
6316 goto error;
6317
6318 if (PyBytes_Check(rep)) {
6319 repsize = PyBytes_GET_SIZE(rep);
6320 if (repsize & 1) {
6321 raise_encode_exception(&exc, encoding,
6322 str, pos - 1, pos,
6323 "surrogates not allowed");
6324 goto error;
6325 }
6326 moreunits = repsize / 2;
6327 }
6328 else {
6329 assert(PyUnicode_Check(rep));
6330 if (PyUnicode_READY(rep) < 0)
6331 goto error;
6332 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6333 if (!PyUnicode_IS_ASCII(rep)) {
6334 raise_encode_exception(&exc, encoding,
6335 str, pos - 1, pos,
6336 "surrogates not allowed");
6337 goto error;
6338 }
6339 }
6340
6341 /* two bytes are reserved for each surrogate */
6342 if (moreunits > 1) {
6343 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006344 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006345 /* integer overflow */
6346 PyErr_NoMemory();
6347 goto error;
6348 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006349 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006350 goto error;
6351 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6352 }
6353
6354 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006355 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006356 out += moreunits;
6357 } else /* rep is unicode */ {
6358 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6359 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6360 &out, native_ordering);
6361 }
6362
6363 Py_CLEAR(rep);
6364 }
6365
6366 /* Cut back to size actually needed. This is necessary for, for example,
6367 encoding of a string containing isolated surrogates and the 'ignore' handler
6368 is used. */
6369 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6370 if (nsize != PyBytes_GET_SIZE(v))
6371 _PyBytes_Resize(&v, nsize);
6372 Py_XDECREF(errorHandler);
6373 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006374 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006375 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006376 error:
6377 Py_XDECREF(rep);
6378 Py_XDECREF(errorHandler);
6379 Py_XDECREF(exc);
6380 Py_XDECREF(v);
6381 return NULL;
6382#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383}
6384
Alexander Belopolsky40018472011-02-26 01:02:56 +00006385PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006386PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6387 Py_ssize_t size,
6388 const char *errors,
6389 int byteorder)
6390{
6391 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006392 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006393 if (tmp == NULL)
6394 return NULL;
6395 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6396 Py_DECREF(tmp);
6397 return result;
6398}
6399
6400PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006401PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404}
6405
/* --- Unicode Escape Codec ----------------------------------------------- */
6407
Victor Stinner47e1afd2020-10-26 16:43:47 +01006408static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006409
Alexander Belopolsky40018472011-02-26 01:02:56 +00006410PyObject *
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006411_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
Eric V. Smith42454af2016-10-31 09:22:08 -04006412 Py_ssize_t size,
6413 const char *errors,
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006414 Py_ssize_t *consumed,
Eric V. Smith42454af2016-10-31 09:22:08 -04006415 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006418 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 PyObject *errorHandler = NULL;
6421 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006422
Eric V. Smith42454af2016-10-31 09:22:08 -04006423 // so we can remember if we've seen an invalid escape char or not
6424 *first_invalid_escape = NULL;
6425
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 if (size == 0) {
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006427 if (consumed) {
6428 *consumed = 0;
6429 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006430 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 }
6432 /* Escaped strings will always be longer than the resulting
6433 Unicode string, so we start with size here and then reduce the
6434 length after conversion to the true value.
6435 (but if the error callback returns a long replacement string
6436 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006437 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 writer.min_length = size;
6439 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6440 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006441 }
6442
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 end = s + size;
6444 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006445 unsigned char c = (unsigned char) *s++;
6446 Py_UCS4 ch;
6447 int count;
6448 Py_ssize_t startinpos;
6449 Py_ssize_t endinpos;
6450 const char *message;
6451
6452#define WRITE_ASCII_CHAR(ch) \
6453 do { \
6454 assert(ch <= 127); \
6455 assert(writer.pos < writer.size); \
6456 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6457 } while(0)
6458
6459#define WRITE_CHAR(ch) \
6460 do { \
6461 if (ch <= writer.maxchar) { \
6462 assert(writer.pos < writer.size); \
6463 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6464 } \
6465 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6466 goto onError; \
6467 } \
6468 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469
6470 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 if (c != '\\') {
6472 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 continue;
6474 }
6475
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006478 if (s >= end) {
6479 message = "\\ at end of string";
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006480 goto incomplete;
Victor Stinner62ec3312016-09-06 17:04:34 -07006481 }
6482 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006483
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006485 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006488 case '\n': continue;
6489 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6490 case '\'': WRITE_ASCII_CHAR('\''); continue;
6491 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6492 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006493 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006494 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6495 case 't': WRITE_ASCII_CHAR('\t'); continue;
6496 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6497 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006498 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006500 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006501 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 case '0': case '1': case '2': case '3':
6505 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006506 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006507 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006508 ch = (ch<<3) + *s++ - '0';
6509 if (s < end && '0' <= *s && *s <= '7') {
6510 ch = (ch<<3) + *s++ - '0';
6511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006513 WRITE_CHAR(ch);
6514 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 /* hex escapes */
6517 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006520 message = "truncated \\xXX escape";
6521 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006525 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006526 message = "truncated \\uXXXX escape";
6527 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006530 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006531 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006532 message = "truncated \\UXXXXXXXX escape";
6533 hexescape:
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006534 for (ch = 0; count; ++s, --count) {
6535 if (s >= end) {
6536 goto incomplete;
6537 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006538 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006539 ch <<= 4;
6540 if (c >= '0' && c <= '9') {
6541 ch += c - '0';
6542 }
6543 else if (c >= 'a' && c <= 'f') {
6544 ch += c - ('a' - 10);
6545 }
6546 else if (c >= 'A' && c <= 'F') {
6547 ch += c - ('A' - 10);
6548 }
6549 else {
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006550 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006551 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006552 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006553
6554 /* when we get here, ch is a 32-bit unicode character */
6555 if (ch > MAX_UNICODE) {
6556 message = "illegal Unicode character";
6557 goto error;
6558 }
6559
6560 WRITE_CHAR(ch);
6561 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006562
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006564 case 'N':
Victor Stinner47e1afd2020-10-26 16:43:47 +01006565 if (ucnhash_capi == NULL) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006566 /* load the unicode data module */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006567 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006568 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner47e1afd2020-10-26 16:43:47 +01006569 if (ucnhash_capi == NULL) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006570 PyErr_SetString(
6571 PyExc_UnicodeError,
6572 "\\N escapes not supported (can't load unicodedata module)"
6573 );
6574 goto onError;
6575 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006576 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006577
6578 message = "malformed \\N character escape";
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006579 if (s >= end) {
6580 goto incomplete;
6581 }
6582 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006583 const char *start = ++s;
6584 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006585 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006586 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006587 s++;
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006588 if (s >= end) {
6589 goto incomplete;
6590 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006591 namelen = s - start;
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006592 if (namelen) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006593 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006594 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006595 ch = 0xffffffff; /* in case 'getcode' messes up */
6596 if (namelen <= INT_MAX &&
Victor Stinner920cb642020-10-26 19:19:36 +01006597 ucnhash_capi->getcode(start, (int)namelen,
Victor Stinner62ec3312016-09-06 17:04:34 -07006598 &ch, 0)) {
6599 assert(ch <= MAX_UNICODE);
6600 WRITE_CHAR(ch);
6601 continue;
6602 }
6603 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006604 }
6605 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006606 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006607
6608 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006609 if (*first_invalid_escape == NULL) {
6610 *first_invalid_escape = s-1; /* Back up one char, since we've
6611 already incremented s. */
6612 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006613 WRITE_ASCII_CHAR('\\');
6614 WRITE_CHAR(c);
6615 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006617
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006618 incomplete:
6619 if (consumed) {
6620 *consumed = startinpos;
6621 break;
6622 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006623 error:
6624 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006625 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006626 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006627 errors, &errorHandler,
6628 "unicodeescape", message,
6629 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006630 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006631 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006632 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006633 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006634
6635#undef WRITE_ASCII_CHAR
6636#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006638
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006639 Py_XDECREF(errorHandler);
6640 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006641 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006642
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006644 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006645 Py_XDECREF(errorHandler);
6646 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 return NULL;
6648}
6649
Eric V. Smith42454af2016-10-31 09:22:08 -04006650PyObject *
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006651_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
Eric V. Smith42454af2016-10-31 09:22:08 -04006652 Py_ssize_t size,
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006653 const char *errors,
6654 Py_ssize_t *consumed)
Eric V. Smith42454af2016-10-31 09:22:08 -04006655{
6656 const char *first_invalid_escape;
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006657 PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6658 consumed,
Eric V. Smith42454af2016-10-31 09:22:08 -04006659 &first_invalid_escape);
6660 if (result == NULL)
6661 return NULL;
6662 if (first_invalid_escape != NULL) {
6663 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6664 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006665 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006666 Py_DECREF(result);
6667 return NULL;
6668 }
6669 }
6670 return result;
6671}
6672
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006673PyObject *
6674PyUnicode_DecodeUnicodeEscape(const char *s,
6675 Py_ssize_t size,
6676 const char *errors)
6677{
6678 return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6679}
6680
/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Alexander Belopolsky40018472011-02-26 01:02:56 +00006683PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006684PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006686 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006687 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006689 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006690 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006691 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692
Ezio Melottie7f90372012-10-05 03:33:31 +03006693 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006694 escape.
6695
Ezio Melottie7f90372012-10-05 03:33:31 +03006696 For UCS1 strings it's '\xxx', 4 bytes per source character.
6697 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6698 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006699 */
6700
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006701 if (!PyUnicode_Check(unicode)) {
6702 PyErr_BadArgument();
6703 return NULL;
6704 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006705 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006706 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006707 }
Victor Stinner358af132015-10-12 22:36:57 +02006708
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006709 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006710 if (len == 0) {
6711 return PyBytes_FromStringAndSize(NULL, 0);
6712 }
6713
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006714 kind = PyUnicode_KIND(unicode);
6715 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006716 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6717 bytes, and 1 byte characters 4. */
6718 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006719 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006720 return PyErr_NoMemory();
6721 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006722 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006723 if (repr == NULL) {
6724 return NULL;
6725 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006726
Victor Stinner62ec3312016-09-06 17:04:34 -07006727 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006728 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006729 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006730
Victor Stinner62ec3312016-09-06 17:04:34 -07006731 /* U+0000-U+00ff range */
6732 if (ch < 0x100) {
6733 if (ch >= ' ' && ch < 127) {
6734 if (ch != '\\') {
6735 /* Copy printable US ASCII as-is */
6736 *p++ = (char) ch;
6737 }
6738 /* Escape backslashes */
6739 else {
6740 *p++ = '\\';
6741 *p++ = '\\';
6742 }
6743 }
Victor Stinner358af132015-10-12 22:36:57 +02006744
Victor Stinner62ec3312016-09-06 17:04:34 -07006745 /* Map special whitespace to '\t', \n', '\r' */
6746 else if (ch == '\t') {
6747 *p++ = '\\';
6748 *p++ = 't';
6749 }
6750 else if (ch == '\n') {
6751 *p++ = '\\';
6752 *p++ = 'n';
6753 }
6754 else if (ch == '\r') {
6755 *p++ = '\\';
6756 *p++ = 'r';
6757 }
6758
6759 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6760 else {
6761 *p++ = '\\';
6762 *p++ = 'x';
6763 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6764 *p++ = Py_hexdigits[ch & 0x000F];
6765 }
Tim Petersced69f82003-09-16 20:30:58 +00006766 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006767 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006768 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 *p++ = '\\';
6770 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006771 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6772 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6773 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6774 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006776 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6777 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006778
Victor Stinner62ec3312016-09-06 17:04:34 -07006779 /* Make sure that the first two digits are zero */
6780 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006781 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006782 *p++ = 'U';
6783 *p++ = '0';
6784 *p++ = '0';
6785 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6786 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6787 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6788 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6789 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6790 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
Victor Stinner62ec3312016-09-06 17:04:34 -07006794 assert(p - PyBytes_AS_STRING(repr) > 0);
6795 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6796 return NULL;
6797 }
6798 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799}
6800
Alexander Belopolsky40018472011-02-26 01:02:56 +00006801PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006802PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6803 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006805 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006806 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006807 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006809 }
6810
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006811 result = PyUnicode_AsUnicodeEscapeString(tmp);
6812 Py_DECREF(tmp);
6813 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814}
6815
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6817
Alexander Belopolsky40018472011-02-26 01:02:56 +00006818PyObject *
6819PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006820 Py_ssize_t size,
6821 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006823 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006824 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006826 PyObject *errorHandler = NULL;
6827 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006828
Victor Stinner62ec3312016-09-06 17:04:34 -07006829 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006830 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006831 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006832
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833 /* Escaped strings will always be longer than the resulting
6834 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006835 length after conversion to the true value. (But decoding error
6836 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006837 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006838 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006839 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6840 goto onError;
6841 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 end = s + size;
6844 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006845 unsigned char c = (unsigned char) *s++;
6846 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006847 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006848 Py_ssize_t startinpos;
6849 Py_ssize_t endinpos;
6850 const char *message;
6851
6852#define WRITE_CHAR(ch) \
6853 do { \
6854 if (ch <= writer.maxchar) { \
6855 assert(writer.pos < writer.size); \
6856 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6857 } \
6858 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6859 goto onError; \
6860 } \
6861 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006864 if (c != '\\' || s >= end) {
6865 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006867 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006868
Victor Stinner62ec3312016-09-06 17:04:34 -07006869 c = (unsigned char) *s++;
6870 if (c == 'u') {
6871 count = 4;
6872 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006874 else if (c == 'U') {
6875 count = 8;
6876 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006877 }
6878 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006879 assert(writer.pos < writer.size);
6880 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6881 WRITE_CHAR(c);
6882 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006883 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006884 startinpos = s - starts - 2;
6885
6886 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6887 for (ch = 0; count && s < end; ++s, --count) {
6888 c = (unsigned char)*s;
6889 ch <<= 4;
6890 if (c >= '0' && c <= '9') {
6891 ch += c - '0';
6892 }
6893 else if (c >= 'a' && c <= 'f') {
6894 ch += c - ('a' - 10);
6895 }
6896 else if (c >= 'A' && c <= 'F') {
6897 ch += c - ('A' - 10);
6898 }
6899 else {
6900 break;
6901 }
6902 }
6903 if (!count) {
6904 if (ch <= MAX_UNICODE) {
6905 WRITE_CHAR(ch);
6906 continue;
6907 }
6908 message = "\\Uxxxxxxxx out of range";
6909 }
6910
6911 endinpos = s-starts;
6912 writer.min_length = end - s + writer.pos;
6913 if (unicode_decode_call_errorhandler_writer(
6914 errors, &errorHandler,
6915 "rawunicodeescape", message,
6916 &starts, &end, &startinpos, &endinpos, &exc, &s,
6917 &writer)) {
6918 goto onError;
6919 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006920 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006921
6922#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006924 Py_XDECREF(errorHandler);
6925 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006926 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006927
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006929 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 Py_XDECREF(errorHandler);
6931 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006933
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934}
6935
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006936
Alexander Belopolsky40018472011-02-26 01:02:56 +00006937PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006938PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939{
Victor Stinner62ec3312016-09-06 17:04:34 -07006940 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006942 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006943 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006944 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006945 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006947 if (!PyUnicode_Check(unicode)) {
6948 PyErr_BadArgument();
6949 return NULL;
6950 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006951 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006952 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006953 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006954 kind = PyUnicode_KIND(unicode);
6955 data = PyUnicode_DATA(unicode);
6956 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006957 if (kind == PyUnicode_1BYTE_KIND) {
6958 return PyBytes_FromStringAndSize(data, len);
6959 }
Victor Stinner0e368262011-11-10 20:12:49 +01006960
Victor Stinner62ec3312016-09-06 17:04:34 -07006961 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6962 bytes, and 1 byte characters 4. */
6963 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006964
Victor Stinner62ec3312016-09-06 17:04:34 -07006965 if (len > PY_SSIZE_T_MAX / expandsize) {
6966 return PyErr_NoMemory();
6967 }
6968 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6969 if (repr == NULL) {
6970 return NULL;
6971 }
6972 if (len == 0) {
6973 return repr;
6974 }
6975
6976 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006977 for (pos = 0; pos < len; pos++) {
6978 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006979
Victor Stinner62ec3312016-09-06 17:04:34 -07006980 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6981 if (ch < 0x100) {
6982 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006983 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006984 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006985 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 *p++ = '\\';
6987 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006988 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6989 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6990 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6991 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006993 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6994 else {
6995 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6996 *p++ = '\\';
6997 *p++ = 'U';
6998 *p++ = '0';
6999 *p++ = '0';
7000 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
7001 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
7002 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7003 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7004 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7005 *p++ = Py_hexdigits[ch & 15];
7006 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00007008
Victor Stinner62ec3312016-09-06 17:04:34 -07007009 assert(p > PyBytes_AS_STRING(repr));
7010 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
7011 return NULL;
7012 }
7013 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014}
7015
Alexander Belopolsky40018472011-02-26 01:02:56 +00007016PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01007017PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
7018 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01007020 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007021 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01007022 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00007023 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01007024 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
7025 Py_DECREF(tmp);
7026 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027}
7028
7029/* --- Latin-1 Codec ------------------------------------------------------ */
7030
Alexander Belopolsky40018472011-02-26 01:02:56 +00007031PyObject *
7032PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007033 Py_ssize_t size,
7034 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06007037 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038}
7039
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007040/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007041static void
7042make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007043 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007044 PyObject *unicode,
7045 Py_ssize_t startpos, Py_ssize_t endpos,
7046 const char *reason)
7047{
7048 if (*exceptionObject == NULL) {
7049 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007050 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01007051 encoding, unicode, startpos, endpos, reason);
7052 }
7053 else {
7054 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7055 goto onError;
7056 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7057 goto onError;
7058 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7059 goto onError;
7060 return;
7061 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02007062 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01007063 }
7064}
7065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007066/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007067static void
7068raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007069 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007070 PyObject *unicode,
7071 Py_ssize_t startpos, Py_ssize_t endpos,
7072 const char *reason)
7073{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007074 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007075 encoding, unicode, startpos, endpos, reason);
7076 if (*exceptionObject != NULL)
7077 PyCodec_StrictErrors(*exceptionObject);
7078}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007079
7080/* error handling callback helper:
7081 build arguments, call the callback and check the arguments,
7082 put the result into newpos and return the replacement string, which
7083 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007084static PyObject *
7085unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007086 PyObject **errorHandler,
7087 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007088 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007089 Py_ssize_t startpos, Py_ssize_t endpos,
7090 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007091{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007092 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007093 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007094 PyObject *restuple;
7095 PyObject *resunicode;
7096
7097 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007099 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007101 }
7102
Benjamin Petersonbac79492012-01-14 13:34:47 -05007103 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007104 return NULL;
7105 len = PyUnicode_GET_LENGTH(unicode);
7106
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007107 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007108 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007109 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007110 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007111
Petr Viktorinffd97532020-02-11 17:46:57 +01007112 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007113 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007115 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007116 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007117 Py_DECREF(restuple);
7118 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007119 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007120 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 &resunicode, newpos)) {
7122 Py_DECREF(restuple);
7123 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007124 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007125 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7126 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7127 Py_DECREF(restuple);
7128 return NULL;
7129 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007130 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007131 *newpos = len + *newpos;
7132 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007133 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 Py_DECREF(restuple);
7135 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007136 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007137 Py_INCREF(resunicode);
7138 Py_DECREF(restuple);
7139 return resunicode;
7140}
7141
Alexander Belopolsky40018472011-02-26 01:02:56 +00007142static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007143unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007144 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007145 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007146{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007147 /* input state */
7148 Py_ssize_t pos=0, size;
7149 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007150 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007151 /* pointer into the output */
7152 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007153 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7154 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007155 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007156 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007157 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007158 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007159 /* output object */
7160 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161
Benjamin Petersonbac79492012-01-14 13:34:47 -05007162 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007163 return NULL;
7164 size = PyUnicode_GET_LENGTH(unicode);
7165 kind = PyUnicode_KIND(unicode);
7166 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007167 /* allocate enough for a simple encoding without
7168 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007169 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007170 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007171
7172 _PyBytesWriter_Init(&writer);
7173 str = _PyBytesWriter_Alloc(&writer, size);
7174 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007175 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007176
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007177 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007178 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007179
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007181 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007183 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007184 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007185 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007187 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007189 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007190 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007192
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007193 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007195
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007196 /* Only overallocate the buffer if it's not the last write */
7197 writer.overallocate = (collend < size);
7198
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007200 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007201 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007202
7203 switch (error_handler) {
7204 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007205 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007207
7208 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007209 memset(str, '?', collend - collstart);
7210 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007211 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007212 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007213 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 break;
Victor Stinner50149202015-09-22 00:26:54 +02007215
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007216 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007217 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007218 writer.min_size -= (collend - collstart);
7219 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007220 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007221 if (str == NULL)
7222 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007223 pos = collend;
7224 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007225
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007226 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007227 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007228 writer.min_size -= (collend - collstart);
7229 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007230 unicode, collstart, collend);
7231 if (str == NULL)
7232 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007233 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 break;
Victor Stinner50149202015-09-22 00:26:54 +02007235
Victor Stinnerc3713e92015-09-29 12:32:13 +02007236 case _Py_ERROR_SURROGATEESCAPE:
7237 for (i = collstart; i < collend; ++i) {
7238 ch = PyUnicode_READ(kind, data, i);
7239 if (ch < 0xdc80 || 0xdcff < ch) {
7240 /* Not a UTF-8b surrogate */
7241 break;
7242 }
7243 *str++ = (char)(ch - 0xdc00);
7244 ++pos;
7245 }
7246 if (i >= collend)
7247 break;
7248 collstart = pos;
7249 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007250 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007251
Benjamin Peterson29060642009-01-31 22:14:21 +00007252 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007253 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7254 encoding, reason, unicode, &exc,
7255 collstart, collend, &newpos);
7256 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007258
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007259 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007260 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007261
Victor Stinner6bd525b2015-10-09 13:10:05 +02007262 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007263 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007264 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007265 PyBytes_AS_STRING(rep),
7266 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007267 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007268 else {
7269 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007270
Victor Stinner6bd525b2015-10-09 13:10:05 +02007271 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007273
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007274 if (limit == 256 ?
7275 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7276 !PyUnicode_IS_ASCII(rep))
7277 {
7278 /* Not all characters are smaller than limit */
7279 raise_encode_exception(&exc, encoding, unicode,
7280 collstart, collend, reason);
7281 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007283 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7284 str = _PyBytesWriter_WriteBytes(&writer, str,
7285 PyUnicode_DATA(rep),
7286 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007288 if (str == NULL)
7289 goto onError;
7290
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007291 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007292 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007293 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007294
7295 /* If overallocation was disabled, ensure that it was the last
7296 write. Otherwise, we missed an optimization */
7297 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007298 }
7299 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007300
Victor Stinner50149202015-09-22 00:26:54 +02007301 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007302 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007303 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007304
7305 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007306 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007307 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007308 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007309 Py_XDECREF(exc);
7310 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007311}
7312
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007313/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007314PyObject *
7315PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007316 Py_ssize_t size,
7317 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007319 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007320 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007321 if (unicode == NULL)
7322 return NULL;
7323 result = unicode_encode_ucs1(unicode, errors, 256);
7324 Py_DECREF(unicode);
7325 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326}
7327
Alexander Belopolsky40018472011-02-26 01:02:56 +00007328PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007329_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330{
7331 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 PyErr_BadArgument();
7333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007335 if (PyUnicode_READY(unicode) == -1)
7336 return NULL;
7337 /* Fast path: if it is a one-byte string, construct
7338 bytes object directly. */
7339 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7340 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7341 PyUnicode_GET_LENGTH(unicode));
7342 /* Non-Latin-1 characters present. Defer to above function to
7343 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007344 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007345}
7346
7347PyObject*
7348PyUnicode_AsLatin1String(PyObject *unicode)
7349{
7350 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351}
7352
7353/* --- 7-bit ASCII Codec -------------------------------------------------- */
7354
Alexander Belopolsky40018472011-02-26 01:02:56 +00007355PyObject *
7356PyUnicode_DecodeASCII(const char *s,
7357 Py_ssize_t size,
7358 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007360 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007361 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007362 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007363 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007364 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007365
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007367 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007368
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007370 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007371 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007373
Inada Naoki770847a2019-06-24 12:30:24 +09007374 // Shortcut for simple case
7375 PyObject *u = PyUnicode_New(size, 127);
7376 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007377 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007378 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007379 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007380 if (outpos == size) {
7381 return u;
7382 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007383
Inada Naoki770847a2019-06-24 12:30:24 +09007384 _PyUnicodeWriter writer;
7385 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007386 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007387
Inada Naoki770847a2019-06-24 12:30:24 +09007388 s += outpos;
7389 int kind = writer.kind;
7390 void *data = writer.data;
7391 Py_ssize_t startinpos, endinpos;
7392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007393 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007394 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007396 PyUnicode_WRITE(kind, data, writer.pos, c);
7397 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007399 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007401
7402 /* byte outsize range 0x00..0x7f: call the error handler */
7403
7404 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007405 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007406
7407 switch (error_handler)
7408 {
7409 case _Py_ERROR_REPLACE:
7410 case _Py_ERROR_SURROGATEESCAPE:
7411 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007412 but we may switch to UCS2 at the first write */
7413 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7414 goto onError;
7415 kind = writer.kind;
7416 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007417
7418 if (error_handler == _Py_ERROR_REPLACE)
7419 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7420 else
7421 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7422 writer.pos++;
7423 ++s;
7424 break;
7425
7426 case _Py_ERROR_IGNORE:
7427 ++s;
7428 break;
7429
7430 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 startinpos = s-starts;
7432 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007433 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007434 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 "ascii", "ordinal not in range(128)",
7436 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007437 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007439 kind = writer.kind;
7440 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007443 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007444 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007445 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007446
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007448 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007449 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007450 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 return NULL;
7452}
7453
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007454/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007455PyObject *
7456PyUnicode_EncodeASCII(const Py_UNICODE *p,
7457 Py_ssize_t size,
7458 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007460 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007461 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007462 if (unicode == NULL)
7463 return NULL;
7464 result = unicode_encode_ucs1(unicode, errors, 128);
7465 Py_DECREF(unicode);
7466 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467}
7468
Alexander Belopolsky40018472011-02-26 01:02:56 +00007469PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007470_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471{
7472 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 PyErr_BadArgument();
7474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007476 if (PyUnicode_READY(unicode) == -1)
7477 return NULL;
7478 /* Fast path: if it is an ASCII-only string, construct bytes object
7479 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007480 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007481 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7482 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007483 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007484}
7485
7486PyObject *
7487PyUnicode_AsASCIIString(PyObject *unicode)
7488{
7489 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490}
7491
Steve Dowercc16be82016-09-08 10:35:16 -07007492#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007493
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007494/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007495
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007496#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007497#define NEED_RETRY
7498#endif
7499
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
7504#define DECODING_CHUNK_SIZE (INT_MAX/4)
7505
Victor Stinner3a50e702011-10-18 21:21:00 +02007506#ifndef WC_ERR_INVALID_CHARS
7507# define WC_ERR_INVALID_CHARS 0x0080
7508#endif
7509
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007510static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007511code_page_name(UINT code_page, PyObject **obj)
7512{
7513 *obj = NULL;
7514 if (code_page == CP_ACP)
7515 return "mbcs";
7516 if (code_page == CP_UTF7)
7517 return "CP_UTF7";
7518 if (code_page == CP_UTF8)
7519 return "CP_UTF8";
7520
7521 *obj = PyBytes_FromFormat("cp%u", code_page);
7522 if (*obj == NULL)
7523 return NULL;
7524 return PyBytes_AS_STRING(*obj);
7525}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526
Victor Stinner3a50e702011-10-18 21:21:00 +02007527static DWORD
7528decode_code_page_flags(UINT code_page)
7529{
7530 if (code_page == CP_UTF7) {
7531 /* The CP_UTF7 decoder only supports flags=0 */
7532 return 0;
7533 }
7534 else
7535 return MB_ERR_INVALID_CHARS;
7536}
7537
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007538/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007539 * Decode a byte string from a Windows code page into unicode object in strict
7540 * mode.
7541 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007542 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7543 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007544 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007545static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007546decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007547 wchar_t **buf,
7548 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 const char *in,
7550 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007551{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007552 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007553 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007555
7556 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007558 while ((outsize = MultiByteToWideChar(code_page, flags,
7559 in, insize, NULL, 0)) <= 0)
7560 {
7561 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7562 goto error;
7563 }
7564 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7565 flags = 0;
7566 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007567
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007568 /* Extend a wchar_t* buffer */
7569 Py_ssize_t n = *bufsize; /* Get the current length */
7570 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7571 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007572 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007573 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007574
7575 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7577 if (outsize <= 0)
7578 goto error;
7579 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007580
Victor Stinner3a50e702011-10-18 21:21:00 +02007581error:
7582 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7583 return -2;
7584 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007585 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007586}
7587
Victor Stinner3a50e702011-10-18 21:21:00 +02007588/*
7589 * Decode a byte string from a code page into unicode object with an error
7590 * handler.
7591 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007592 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 * UnicodeDecodeError exception and returns -1 on error.
7594 */
7595static int
7596decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007597 wchar_t **buf,
7598 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007599 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007600 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007601{
7602 const char *startin = in;
7603 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007604 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 /* Ideally, we should get reason from FormatMessage. This is the Windows
7606 2000 English version of the message. */
7607 const char *reason = "No mapping for the Unicode character exists "
7608 "in the target code page.";
7609 /* each step cannot decode more than 1 character, but a character can be
7610 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007611 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007612 int insize;
7613 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 PyObject *errorHandler = NULL;
7615 PyObject *exc = NULL;
7616 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007617 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 DWORD err;
7619 int ret = -1;
7620
7621 assert(size > 0);
7622
7623 encoding = code_page_name(code_page, &encoding_obj);
7624 if (encoding == NULL)
7625 return -1;
7626
Victor Stinner7d00cc12014-03-17 23:08:06 +01007627 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7629 UnicodeDecodeError. */
7630 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7631 if (exc != NULL) {
7632 PyCodec_StrictErrors(exc);
7633 Py_CLEAR(exc);
7634 }
7635 goto error;
7636 }
7637
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007638 /* Extend a wchar_t* buffer */
7639 Py_ssize_t n = *bufsize; /* Get the current length */
7640 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7641 PyErr_NoMemory();
7642 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007644 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7645 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007647 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007648
7649 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 while (in < endin)
7651 {
7652 /* Decode a character */
7653 insize = 1;
7654 do
7655 {
7656 outsize = MultiByteToWideChar(code_page, flags,
7657 in, insize,
7658 buffer, Py_ARRAY_LENGTH(buffer));
7659 if (outsize > 0)
7660 break;
7661 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007662 if (err == ERROR_INVALID_FLAGS && flags) {
7663 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7664 flags = 0;
7665 continue;
7666 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 if (err != ERROR_NO_UNICODE_TRANSLATION
7668 && err != ERROR_INSUFFICIENT_BUFFER)
7669 {
7670 PyErr_SetFromWindowsErr(0);
7671 goto error;
7672 }
7673 insize++;
7674 }
7675 /* 4=maximum length of a UTF-8 sequence */
7676 while (insize <= 4 && (in + insize) <= endin);
7677
7678 if (outsize <= 0) {
7679 Py_ssize_t startinpos, endinpos, outpos;
7680
Victor Stinner7d00cc12014-03-17 23:08:06 +01007681 /* last character in partial decode? */
7682 if (in + insize >= endin && !final)
7683 break;
7684
Victor Stinner3a50e702011-10-18 21:21:00 +02007685 startinpos = in - startin;
7686 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007687 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007688 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007689 errors, &errorHandler,
7690 encoding, reason,
7691 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007692 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 {
7694 goto error;
7695 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007696 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007697 }
7698 else {
7699 in += insize;
7700 memcpy(out, buffer, outsize * sizeof(wchar_t));
7701 out += outsize;
7702 }
7703 }
7704
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007705 /* Shrink the buffer */
7706 assert(out - *buf <= *bufsize);
7707 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007708 /* (in - startin) <= size and size is an int */
7709 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007710
7711error:
7712 Py_XDECREF(encoding_obj);
7713 Py_XDECREF(errorHandler);
7714 Py_XDECREF(exc);
7715 return ret;
7716}
7717
Victor Stinner3a50e702011-10-18 21:21:00 +02007718static PyObject *
7719decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007720 const char *s, Py_ssize_t size,
7721 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007722{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007723 wchar_t *buf = NULL;
7724 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007725 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007726
Victor Stinner3a50e702011-10-18 21:21:00 +02007727 if (code_page < 0) {
7728 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7729 return NULL;
7730 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007731 if (size < 0) {
7732 PyErr_BadInternalCall();
7733 return NULL;
7734 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007735
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007736 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007738
Victor Stinner76a31a62011-11-04 00:05:13 +01007739 do
7740 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007741#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007742 if (size > DECODING_CHUNK_SIZE) {
7743 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007744 final = 0;
7745 done = 0;
7746 }
7747 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007748#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007749 {
7750 chunk_size = (int)size;
7751 final = (consumed == NULL);
7752 done = 1;
7753 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007754
Victor Stinner76a31a62011-11-04 00:05:13 +01007755 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007756 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007757 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007758 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007759 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007760
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007761 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007762 s, chunk_size);
7763 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007764 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007765 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007766 errors, final);
7767 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007768
7769 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007770 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007771 return NULL;
7772 }
7773
7774 if (consumed)
7775 *consumed += converted;
7776
7777 s += converted;
7778 size -= converted;
7779 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007780
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007781 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7782 PyMem_Free(buf);
7783 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007784}
7785
Alexander Belopolsky40018472011-02-26 01:02:56 +00007786PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007787PyUnicode_DecodeCodePageStateful(int code_page,
7788 const char *s,
7789 Py_ssize_t size,
7790 const char *errors,
7791 Py_ssize_t *consumed)
7792{
7793 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7794}
7795
7796PyObject *
7797PyUnicode_DecodeMBCSStateful(const char *s,
7798 Py_ssize_t size,
7799 const char *errors,
7800 Py_ssize_t *consumed)
7801{
7802 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7803}
7804
7805PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007806PyUnicode_DecodeMBCS(const char *s,
7807 Py_ssize_t size,
7808 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007809{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007810 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7811}
7812
Victor Stinner3a50e702011-10-18 21:21:00 +02007813static DWORD
7814encode_code_page_flags(UINT code_page, const char *errors)
7815{
7816 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007817 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007818 }
7819 else if (code_page == CP_UTF7) {
7820 /* CP_UTF7 only supports flags=0 */
7821 return 0;
7822 }
7823 else {
7824 if (errors != NULL && strcmp(errors, "replace") == 0)
7825 return 0;
7826 else
7827 return WC_NO_BEST_FIT_CHARS;
7828 }
7829}
7830
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007831/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007832 * Encode a Unicode string to a Windows code page into a byte string in strict
7833 * mode.
7834 *
7835 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007836 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007837 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007838static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007839encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007840 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007841 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007842{
Victor Stinner554f3f02010-06-16 23:33:54 +00007843 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007844 BOOL *pusedDefaultChar = &usedDefaultChar;
7845 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007846 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007847 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007848 const DWORD flags = encode_code_page_flags(code_page, NULL);
7849 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007850 /* Create a substring so that we can get the UTF-16 representation
7851 of just the slice under consideration. */
7852 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007853 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007854
Martin v. Löwis3d325192011-11-04 18:23:06 +01007855 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007856
Victor Stinner3a50e702011-10-18 21:21:00 +02007857 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007858 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007859 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007860 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007861
Victor Stinner2fc507f2011-11-04 20:06:39 +01007862 substring = PyUnicode_Substring(unicode, offset, offset+len);
7863 if (substring == NULL)
7864 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007865#if USE_UNICODE_WCHAR_CACHE
7866_Py_COMP_DIAG_PUSH
7867_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007868 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7869 if (p == NULL) {
7870 Py_DECREF(substring);
7871 return -1;
7872 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007873_Py_COMP_DIAG_POP
7874#else /* USE_UNICODE_WCHAR_CACHE */
7875 p = PyUnicode_AsWideCharString(substring, &size);
7876 Py_CLEAR(substring);
7877 if (p == NULL) {
7878 return -1;
7879 }
7880#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007881 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007882
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007883 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007884 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007885 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007886 NULL, 0,
7887 NULL, pusedDefaultChar);
7888 if (outsize <= 0)
7889 goto error;
7890 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007891 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007892 ret = -2;
7893 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007894 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007895
Victor Stinner3a50e702011-10-18 21:21:00 +02007896 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007898 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007899 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007900 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007901 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007902 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007903 }
7904 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007906 const Py_ssize_t n = PyBytes_Size(*outbytes);
7907 if (outsize > PY_SSIZE_T_MAX - n) {
7908 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007909 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007910 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007911 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007912 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007913 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007914 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007915 }
7916
7917 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007918 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007919 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007920 out, outsize,
7921 NULL, pusedDefaultChar);
7922 if (outsize <= 0)
7923 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007924 if (pusedDefaultChar && *pusedDefaultChar) {
7925 ret = -2;
7926 goto done;
7927 }
7928 ret = 0;
7929
7930done:
7931#if USE_UNICODE_WCHAR_CACHE
7932 Py_DECREF(substring);
7933#else /* USE_UNICODE_WCHAR_CACHE */
7934 PyMem_Free(p);
7935#endif /* USE_UNICODE_WCHAR_CACHE */
7936 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007937
Victor Stinner3a50e702011-10-18 21:21:00 +02007938error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007939 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7940 ret = -2;
7941 goto done;
7942 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007943 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007944 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007945}
7946
Victor Stinner3a50e702011-10-18 21:21:00 +02007947/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007948 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007949 * error handler.
7950 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007951 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007952 * -1 on other error.
7953 */
7954static int
7955encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007956 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007957 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007958{
Victor Stinner3a50e702011-10-18 21:21:00 +02007959 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007960 Py_ssize_t pos = unicode_offset;
7961 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007962 /* Ideally, we should get reason from FormatMessage. This is the Windows
7963 2000 English version of the message. */
7964 const char *reason = "invalid character";
7965 /* 4=maximum length of a UTF-8 sequence */
7966 char buffer[4];
7967 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7968 Py_ssize_t outsize;
7969 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007970 PyObject *errorHandler = NULL;
7971 PyObject *exc = NULL;
7972 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007973 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007974 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007975 PyObject *rep;
7976 int ret = -1;
7977
7978 assert(insize > 0);
7979
7980 encoding = code_page_name(code_page, &encoding_obj);
7981 if (encoding == NULL)
7982 return -1;
7983
7984 if (errors == NULL || strcmp(errors, "strict") == 0) {
7985 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7986 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007987 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007988 if (exc != NULL) {
7989 PyCodec_StrictErrors(exc);
7990 Py_DECREF(exc);
7991 }
7992 Py_XDECREF(encoding_obj);
7993 return -1;
7994 }
7995
7996 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7997 pusedDefaultChar = &usedDefaultChar;
7998 else
7999 pusedDefaultChar = NULL;
8000
8001 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
8002 PyErr_NoMemory();
8003 goto error;
8004 }
8005 outsize = insize * Py_ARRAY_LENGTH(buffer);
8006
8007 if (*outbytes == NULL) {
8008 /* Create string object */
8009 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8010 if (*outbytes == NULL)
8011 goto error;
8012 out = PyBytes_AS_STRING(*outbytes);
8013 }
8014 else {
8015 /* Extend string object */
8016 Py_ssize_t n = PyBytes_Size(*outbytes);
8017 if (n > PY_SSIZE_T_MAX - outsize) {
8018 PyErr_NoMemory();
8019 goto error;
8020 }
8021 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
8022 goto error;
8023 out = PyBytes_AS_STRING(*outbytes) + n;
8024 }
8025
8026 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01008027 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02008028 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01008029 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8030 wchar_t chars[2];
8031 int charsize;
8032 if (ch < 0x10000) {
8033 chars[0] = (wchar_t)ch;
8034 charsize = 1;
8035 }
8036 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01008037 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8038 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01008039 charsize = 2;
8040 }
8041
Victor Stinner3a50e702011-10-18 21:21:00 +02008042 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008043 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02008044 buffer, Py_ARRAY_LENGTH(buffer),
8045 NULL, pusedDefaultChar);
8046 if (outsize > 0) {
8047 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8048 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008049 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02008050 memcpy(out, buffer, outsize);
8051 out += outsize;
8052 continue;
8053 }
8054 }
8055 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8056 PyErr_SetFromWindowsErr(0);
8057 goto error;
8058 }
8059
Victor Stinner3a50e702011-10-18 21:21:00 +02008060 rep = unicode_encode_call_errorhandler(
8061 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01008062 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008063 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02008064 if (rep == NULL)
8065 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008066 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02008067
8068 if (PyBytes_Check(rep)) {
8069 outsize = PyBytes_GET_SIZE(rep);
8070 if (outsize != 1) {
8071 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8072 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8073 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8074 Py_DECREF(rep);
8075 goto error;
8076 }
8077 out = PyBytes_AS_STRING(*outbytes) + offset;
8078 }
8079 memcpy(out, PyBytes_AS_STRING(rep), outsize);
8080 out += outsize;
8081 }
8082 else {
8083 Py_ssize_t i;
8084 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008085 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008086
Benjamin Petersonbac79492012-01-14 13:34:47 -05008087 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008088 Py_DECREF(rep);
8089 goto error;
8090 }
8091
8092 outsize = PyUnicode_GET_LENGTH(rep);
8093 if (outsize != 1) {
8094 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8095 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8096 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8097 Py_DECREF(rep);
8098 goto error;
8099 }
8100 out = PyBytes_AS_STRING(*outbytes) + offset;
8101 }
8102 kind = PyUnicode_KIND(rep);
8103 data = PyUnicode_DATA(rep);
8104 for (i=0; i < outsize; i++) {
8105 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8106 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008107 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008108 encoding, unicode,
8109 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008110 "unable to encode error handler result to ASCII");
8111 Py_DECREF(rep);
8112 goto error;
8113 }
8114 *out = (unsigned char)ch;
8115 out++;
8116 }
8117 }
8118 Py_DECREF(rep);
8119 }
8120 /* write a NUL byte */
8121 *out = 0;
8122 outsize = out - PyBytes_AS_STRING(*outbytes);
8123 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8124 if (_PyBytes_Resize(outbytes, outsize) < 0)
8125 goto error;
8126 ret = 0;
8127
8128error:
8129 Py_XDECREF(encoding_obj);
8130 Py_XDECREF(errorHandler);
8131 Py_XDECREF(exc);
8132 return ret;
8133}
8134
Victor Stinner3a50e702011-10-18 21:21:00 +02008135static PyObject *
8136encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008137 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008138 const char *errors)
8139{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008140 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008141 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008142 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008143 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008144
Victor Stinner29dacf22015-01-26 16:41:32 +01008145 if (!PyUnicode_Check(unicode)) {
8146 PyErr_BadArgument();
8147 return NULL;
8148 }
8149
Benjamin Petersonbac79492012-01-14 13:34:47 -05008150 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008151 return NULL;
8152 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008153
Victor Stinner3a50e702011-10-18 21:21:00 +02008154 if (code_page < 0) {
8155 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8156 return NULL;
8157 }
8158
Martin v. Löwis3d325192011-11-04 18:23:06 +01008159 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008160 return PyBytes_FromStringAndSize(NULL, 0);
8161
Victor Stinner7581cef2011-11-03 22:32:33 +01008162 offset = 0;
8163 do
8164 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008165#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008166 if (len > DECODING_CHUNK_SIZE) {
8167 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008168 done = 0;
8169 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008170 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008171#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008172 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008173 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008174 done = 1;
8175 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008176
Victor Stinner76a31a62011-11-04 00:05:13 +01008177 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008178 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008179 errors);
8180 if (ret == -2)
8181 ret = encode_code_page_errors(code_page, &outbytes,
8182 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008183 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008184 if (ret < 0) {
8185 Py_XDECREF(outbytes);
8186 return NULL;
8187 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008188
Victor Stinner7581cef2011-11-03 22:32:33 +01008189 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008190 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008191 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008192
Victor Stinner3a50e702011-10-18 21:21:00 +02008193 return outbytes;
8194}
8195
8196PyObject *
8197PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8198 Py_ssize_t size,
8199 const char *errors)
8200{
Victor Stinner7581cef2011-11-03 22:32:33 +01008201 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008202 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008203 if (unicode == NULL)
8204 return NULL;
8205 res = encode_code_page(CP_ACP, unicode, errors);
8206 Py_DECREF(unicode);
8207 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008208}
8209
8210PyObject *
8211PyUnicode_EncodeCodePage(int code_page,
8212 PyObject *unicode,
8213 const char *errors)
8214{
Victor Stinner7581cef2011-11-03 22:32:33 +01008215 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008216}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008217
Alexander Belopolsky40018472011-02-26 01:02:56 +00008218PyObject *
8219PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008220{
Victor Stinner7581cef2011-11-03 22:32:33 +01008221 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008222}
8223
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008224#undef NEED_RETRY
8225
Steve Dowercc16be82016-09-08 10:35:16 -07008226#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008227
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228/* --- Character Mapping Codec -------------------------------------------- */
8229
Victor Stinnerfb161b12013-04-18 01:44:27 +02008230static int
8231charmap_decode_string(const char *s,
8232 Py_ssize_t size,
8233 PyObject *mapping,
8234 const char *errors,
8235 _PyUnicodeWriter *writer)
8236{
8237 const char *starts = s;
8238 const char *e;
8239 Py_ssize_t startinpos, endinpos;
8240 PyObject *errorHandler = NULL, *exc = NULL;
8241 Py_ssize_t maplen;
8242 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008243 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008244 Py_UCS4 x;
8245 unsigned char ch;
8246
8247 if (PyUnicode_READY(mapping) == -1)
8248 return -1;
8249
8250 maplen = PyUnicode_GET_LENGTH(mapping);
8251 mapdata = PyUnicode_DATA(mapping);
8252 mapkind = PyUnicode_KIND(mapping);
8253
8254 e = s + size;
8255
8256 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8257 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8258 * is disabled in encoding aliases, latin1 is preferred because
8259 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008260 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008261 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8262 Py_UCS4 maxchar = writer->maxchar;
8263
8264 assert (writer->kind == PyUnicode_1BYTE_KIND);
8265 while (s < e) {
8266 ch = *s;
8267 x = mapdata_ucs1[ch];
8268 if (x > maxchar) {
8269 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8270 goto onError;
8271 maxchar = writer->maxchar;
8272 outdata = (Py_UCS1 *)writer->data;
8273 }
8274 outdata[writer->pos] = x;
8275 writer->pos++;
8276 ++s;
8277 }
8278 return 0;
8279 }
8280
8281 while (s < e) {
8282 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8283 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008284 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008285 if (outkind == PyUnicode_1BYTE_KIND) {
8286 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8287 Py_UCS4 maxchar = writer->maxchar;
8288 while (s < e) {
8289 ch = *s;
8290 x = mapdata_ucs2[ch];
8291 if (x > maxchar)
8292 goto Error;
8293 outdata[writer->pos] = x;
8294 writer->pos++;
8295 ++s;
8296 }
8297 break;
8298 }
8299 else if (outkind == PyUnicode_2BYTE_KIND) {
8300 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8301 while (s < e) {
8302 ch = *s;
8303 x = mapdata_ucs2[ch];
8304 if (x == 0xFFFE)
8305 goto Error;
8306 outdata[writer->pos] = x;
8307 writer->pos++;
8308 ++s;
8309 }
8310 break;
8311 }
8312 }
8313 ch = *s;
8314
8315 if (ch < maplen)
8316 x = PyUnicode_READ(mapkind, mapdata, ch);
8317 else
8318 x = 0xfffe; /* invalid value */
8319Error:
8320 if (x == 0xfffe)
8321 {
8322 /* undefined mapping */
8323 startinpos = s-starts;
8324 endinpos = startinpos+1;
8325 if (unicode_decode_call_errorhandler_writer(
8326 errors, &errorHandler,
8327 "charmap", "character maps to <undefined>",
8328 &starts, &e, &startinpos, &endinpos, &exc, &s,
8329 writer)) {
8330 goto onError;
8331 }
8332 continue;
8333 }
8334
8335 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8336 goto onError;
8337 ++s;
8338 }
8339 Py_XDECREF(errorHandler);
8340 Py_XDECREF(exc);
8341 return 0;
8342
8343onError:
8344 Py_XDECREF(errorHandler);
8345 Py_XDECREF(exc);
8346 return -1;
8347}
8348
8349static int
8350charmap_decode_mapping(const char *s,
8351 Py_ssize_t size,
8352 PyObject *mapping,
8353 const char *errors,
8354 _PyUnicodeWriter *writer)
8355{
8356 const char *starts = s;
8357 const char *e;
8358 Py_ssize_t startinpos, endinpos;
8359 PyObject *errorHandler = NULL, *exc = NULL;
8360 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008361 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008362
8363 e = s + size;
8364
8365 while (s < e) {
8366 ch = *s;
8367
8368 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8369 key = PyLong_FromLong((long)ch);
8370 if (key == NULL)
8371 goto onError;
8372
8373 item = PyObject_GetItem(mapping, key);
8374 Py_DECREF(key);
8375 if (item == NULL) {
8376 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8377 /* No mapping found means: mapping is undefined. */
8378 PyErr_Clear();
8379 goto Undefined;
8380 } else
8381 goto onError;
8382 }
8383
8384 /* Apply mapping */
8385 if (item == Py_None)
8386 goto Undefined;
8387 if (PyLong_Check(item)) {
8388 long value = PyLong_AS_LONG(item);
8389 if (value == 0xFFFE)
8390 goto Undefined;
8391 if (value < 0 || value > MAX_UNICODE) {
8392 PyErr_Format(PyExc_TypeError,
Max Bernstein36353882020-10-17 13:38:21 -07008393 "character mapping must be in range(0x%x)",
Victor Stinnerfb161b12013-04-18 01:44:27 +02008394 (unsigned long)MAX_UNICODE + 1);
8395 goto onError;
8396 }
8397
8398 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8399 goto onError;
8400 }
8401 else if (PyUnicode_Check(item)) {
8402 if (PyUnicode_READY(item) == -1)
8403 goto onError;
8404 if (PyUnicode_GET_LENGTH(item) == 1) {
8405 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8406 if (value == 0xFFFE)
8407 goto Undefined;
8408 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8409 goto onError;
8410 }
8411 else {
8412 writer->overallocate = 1;
8413 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8414 goto onError;
8415 }
8416 }
8417 else {
8418 /* wrong return value */
8419 PyErr_SetString(PyExc_TypeError,
8420 "character mapping must return integer, None or str");
8421 goto onError;
8422 }
8423 Py_CLEAR(item);
8424 ++s;
8425 continue;
8426
8427Undefined:
8428 /* undefined mapping */
8429 Py_CLEAR(item);
8430 startinpos = s-starts;
8431 endinpos = startinpos+1;
8432 if (unicode_decode_call_errorhandler_writer(
8433 errors, &errorHandler,
8434 "charmap", "character maps to <undefined>",
8435 &starts, &e, &startinpos, &endinpos, &exc, &s,
8436 writer)) {
8437 goto onError;
8438 }
8439 }
8440 Py_XDECREF(errorHandler);
8441 Py_XDECREF(exc);
8442 return 0;
8443
8444onError:
8445 Py_XDECREF(item);
8446 Py_XDECREF(errorHandler);
8447 Py_XDECREF(exc);
8448 return -1;
8449}
8450
Alexander Belopolsky40018472011-02-26 01:02:56 +00008451PyObject *
8452PyUnicode_DecodeCharmap(const char *s,
8453 Py_ssize_t size,
8454 PyObject *mapping,
8455 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008457 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008458
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459 /* Default to Latin-1 */
8460 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008464 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008465 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008466 writer.min_length = size;
8467 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008469
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008470 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008471 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8472 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008473 }
8474 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008475 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8476 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008478 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008479
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008481 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 return NULL;
8483}
8484
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008485/* Charmap encoding: the lookup table */
8486
Alexander Belopolsky40018472011-02-26 01:02:56 +00008487struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 PyObject_HEAD
8489 unsigned char level1[32];
8490 int count2, count3;
8491 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008492};
8493
8494static PyObject*
8495encoding_map_size(PyObject *obj, PyObject* args)
8496{
8497 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008498 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008500}
8501
8502static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008503 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 PyDoc_STR("Return the size (in bytes) of this object") },
8505 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008506};
8507
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008508static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008509 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 "EncodingMap", /*tp_name*/
8511 sizeof(struct encoding_map), /*tp_basicsize*/
8512 0, /*tp_itemsize*/
8513 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008514 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008515 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 0, /*tp_getattr*/
8517 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008518 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 0, /*tp_repr*/
8520 0, /*tp_as_number*/
8521 0, /*tp_as_sequence*/
8522 0, /*tp_as_mapping*/
8523 0, /*tp_hash*/
8524 0, /*tp_call*/
8525 0, /*tp_str*/
8526 0, /*tp_getattro*/
8527 0, /*tp_setattro*/
8528 0, /*tp_as_buffer*/
8529 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8530 0, /*tp_doc*/
8531 0, /*tp_traverse*/
8532 0, /*tp_clear*/
8533 0, /*tp_richcompare*/
8534 0, /*tp_weaklistoffset*/
8535 0, /*tp_iter*/
8536 0, /*tp_iternext*/
8537 encoding_map_methods, /*tp_methods*/
8538 0, /*tp_members*/
8539 0, /*tp_getset*/
8540 0, /*tp_base*/
8541 0, /*tp_dict*/
8542 0, /*tp_descr_get*/
8543 0, /*tp_descr_set*/
8544 0, /*tp_dictoffset*/
8545 0, /*tp_init*/
8546 0, /*tp_alloc*/
8547 0, /*tp_new*/
8548 0, /*tp_free*/
8549 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008550};
8551
8552PyObject*
8553PyUnicode_BuildEncodingMap(PyObject* string)
8554{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008555 PyObject *result;
8556 struct encoding_map *mresult;
8557 int i;
8558 int need_dict = 0;
8559 unsigned char level1[32];
8560 unsigned char level2[512];
8561 unsigned char *mlevel1, *mlevel2, *mlevel3;
8562 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008564 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008565 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008567
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008568 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008569 PyErr_BadArgument();
8570 return NULL;
8571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 kind = PyUnicode_KIND(string);
8573 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008574 length = PyUnicode_GET_LENGTH(string);
8575 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008576 memset(level1, 0xFF, sizeof level1);
8577 memset(level2, 0xFF, sizeof level2);
8578
8579 /* If there isn't a one-to-one mapping of NULL to \0,
8580 or if there are non-BMP characters, we need to use
8581 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008583 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008584 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008585 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 ch = PyUnicode_READ(kind, data, i);
8587 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008588 need_dict = 1;
8589 break;
8590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008592 /* unmapped character */
8593 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 l1 = ch >> 11;
8595 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008596 if (level1[l1] == 0xFF)
8597 level1[l1] = count2++;
8598 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008599 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008600 }
8601
8602 if (count2 >= 0xFF || count3 >= 0xFF)
8603 need_dict = 1;
8604
8605 if (need_dict) {
8606 PyObject *result = PyDict_New();
8607 PyObject *key, *value;
8608 if (!result)
8609 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008610 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008612 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008613 if (!key || !value)
8614 goto failed1;
8615 if (PyDict_SetItem(result, key, value) == -1)
8616 goto failed1;
8617 Py_DECREF(key);
8618 Py_DECREF(value);
8619 }
8620 return result;
8621 failed1:
8622 Py_XDECREF(key);
8623 Py_XDECREF(value);
8624 Py_DECREF(result);
8625 return NULL;
8626 }
8627
8628 /* Create a three-level trie */
Victor Stinner32bd68c2020-12-01 10:37:39 +01008629 result = PyObject_Malloc(sizeof(struct encoding_map) +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008630 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008631 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008632 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008633 }
8634
8635 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008636 mresult = (struct encoding_map*)result;
8637 mresult->count2 = count2;
8638 mresult->count3 = count3;
8639 mlevel1 = mresult->level1;
8640 mlevel2 = mresult->level23;
8641 mlevel3 = mresult->level23 + 16*count2;
8642 memcpy(mlevel1, level1, 32);
8643 memset(mlevel2, 0xFF, 16*count2);
8644 memset(mlevel3, 0, 128*count3);
8645 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008646 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008647 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008648 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8649 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008650 /* unmapped character */
8651 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008652 o1 = ch>>11;
8653 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008654 i2 = 16*mlevel1[o1] + o2;
8655 if (mlevel2[i2] == 0xFF)
8656 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008657 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008658 i3 = 128*mlevel2[i2] + o3;
8659 mlevel3[i3] = i;
8660 }
8661 return result;
8662}
8663
8664static int
Victor Stinner22168992011-11-20 17:09:18 +01008665encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008666{
8667 struct encoding_map *map = (struct encoding_map*)mapping;
8668 int l1 = c>>11;
8669 int l2 = (c>>7) & 0xF;
8670 int l3 = c & 0x7F;
8671 int i;
8672
Victor Stinner22168992011-11-20 17:09:18 +01008673 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008675 if (c == 0)
8676 return 0;
8677 /* level 1*/
8678 i = map->level1[l1];
8679 if (i == 0xFF) {
8680 return -1;
8681 }
8682 /* level 2*/
8683 i = map->level23[16*i+l2];
8684 if (i == 0xFF) {
8685 return -1;
8686 }
8687 /* level 3 */
8688 i = map->level23[16*map->count2 + 128*i + l3];
8689 if (i == 0) {
8690 return -1;
8691 }
8692 return i;
8693}
8694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695/* Lookup the character ch in the mapping. If the character
8696 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008697 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008698static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008699charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700{
Christian Heimes217cfd12007-12-02 14:31:20 +00008701 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 PyObject *x;
8703
8704 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 x = PyObject_GetItem(mapping, w);
8707 Py_DECREF(w);
8708 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8710 /* No mapping found means: mapping is undefined. */
8711 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008712 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 } else
8714 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008716 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008718 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 long value = PyLong_AS_LONG(x);
8720 if (value < 0 || value > 255) {
8721 PyErr_SetString(PyExc_TypeError,
8722 "character mapping must be in range(256)");
8723 Py_DECREF(x);
8724 return NULL;
8725 }
8726 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008728 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 /* wrong return value */
8732 PyErr_Format(PyExc_TypeError,
8733 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008734 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 Py_DECREF(x);
8736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 }
8738}
8739
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008740static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008741charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008742{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008743 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8744 /* exponentially overallocate to minimize reallocations */
8745 if (requiredsize < 2*outsize)
8746 requiredsize = 2*outsize;
8747 if (_PyBytes_Resize(outobj, requiredsize))
8748 return -1;
8749 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008750}
8751
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008756 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757 space is available. Return a new reference to the object that
8758 was put in the output buffer, or Py_None, if the mapping was undefined
8759 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008760 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008761static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008762charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008763 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008765 PyObject *rep;
8766 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008767 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768
Andy Lesterdffe4c02020-03-04 07:15:20 -06008769 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008770 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008772 if (res == -1)
8773 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 if (outsize<requiredsize)
8775 if (charmapencode_resize(outobj, outpos, requiredsize))
8776 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008777 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 outstart[(*outpos)++] = (char)res;
8779 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008780 }
8781
8782 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008785 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 Py_DECREF(rep);
8787 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008788 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 if (PyLong_Check(rep)) {
8790 Py_ssize_t requiredsize = *outpos+1;
8791 if (outsize<requiredsize)
8792 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8793 Py_DECREF(rep);
8794 return enc_EXCEPTION;
8795 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008796 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008798 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008799 else {
8800 const char *repchars = PyBytes_AS_STRING(rep);
8801 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8802 Py_ssize_t requiredsize = *outpos+repsize;
8803 if (outsize<requiredsize)
8804 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8805 Py_DECREF(rep);
8806 return enc_EXCEPTION;
8807 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008808 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 memcpy(outstart + *outpos, repchars, repsize);
8810 *outpos += repsize;
8811 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008812 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008813 Py_DECREF(rep);
8814 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008815}
8816
8817/* handle an error in PyUnicode_EncodeCharmap
8818 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008819static int
8820charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008821 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008822 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008823 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008824 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825{
8826 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008827 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008828 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008829 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008830 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008831 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008832 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008833 Py_ssize_t collstartpos = *inpos;
8834 Py_ssize_t collendpos = *inpos+1;
8835 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008836 const char *encoding = "charmap";
8837 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008838 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008839 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008840 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008841
Benjamin Petersonbac79492012-01-14 13:34:47 -05008842 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008843 return -1;
8844 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008845 /* find all unencodable characters */
8846 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008847 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008848 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008849 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008850 val = encoding_map_lookup(ch, mapping);
8851 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 break;
8853 ++collendpos;
8854 continue;
8855 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008856
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008857 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8858 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008859 if (rep==NULL)
8860 return -1;
8861 else if (rep!=Py_None) {
8862 Py_DECREF(rep);
8863 break;
8864 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008865 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008867 }
8868 /* cache callback name lookup
8869 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008870 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008871 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008872
8873 switch (*error_handler) {
8874 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008875 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008876 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008877
8878 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008879 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 x = charmapencode_output('?', mapping, res, respos);
8881 if (x==enc_EXCEPTION) {
8882 return -1;
8883 }
8884 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008885 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 return -1;
8887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008888 }
8889 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008890 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008891 *inpos = collendpos;
8892 break;
Victor Stinner50149202015-09-22 00:26:54 +02008893
8894 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008895 /* generate replacement (temporarily (mis)uses p) */
8896 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 char buffer[2+29+1+1];
8898 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008899 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 for (cp = buffer; *cp; ++cp) {
8901 x = charmapencode_output(*cp, mapping, res, respos);
8902 if (x==enc_EXCEPTION)
8903 return -1;
8904 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008905 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 return -1;
8907 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008908 }
8909 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008910 *inpos = collendpos;
8911 break;
Victor Stinner50149202015-09-22 00:26:54 +02008912
Benjamin Peterson14339b62009-01-31 16:36:08 +00008913 default:
Victor Stinner50149202015-09-22 00:26:54 +02008914 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008915 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008917 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008919 if (PyBytes_Check(repunicode)) {
8920 /* Directly copy bytes result to output. */
8921 Py_ssize_t outsize = PyBytes_Size(*res);
8922 Py_ssize_t requiredsize;
8923 repsize = PyBytes_Size(repunicode);
8924 requiredsize = *respos + repsize;
8925 if (requiredsize > outsize)
8926 /* Make room for all additional bytes. */
8927 if (charmapencode_resize(res, respos, requiredsize)) {
8928 Py_DECREF(repunicode);
8929 return -1;
8930 }
8931 memcpy(PyBytes_AsString(*res) + *respos,
8932 PyBytes_AsString(repunicode), repsize);
8933 *respos += repsize;
8934 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008935 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008936 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008937 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008938 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008939 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008940 Py_DECREF(repunicode);
8941 return -1;
8942 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008943 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008944 data = PyUnicode_DATA(repunicode);
8945 kind = PyUnicode_KIND(repunicode);
8946 for (index = 0; index < repsize; index++) {
8947 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8948 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008950 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 return -1;
8952 }
8953 else if (x==enc_FAILED) {
8954 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008955 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 return -1;
8957 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008958 }
8959 *inpos = newpos;
8960 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008961 }
8962 return 0;
8963}
8964
Alexander Belopolsky40018472011-02-26 01:02:56 +00008965PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008966_PyUnicode_EncodeCharmap(PyObject *unicode,
8967 PyObject *mapping,
8968 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008970 /* output object */
8971 PyObject *res = NULL;
8972 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008973 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008974 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008975 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008976 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008977 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008978 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008979 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008980 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008981 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982
Benjamin Petersonbac79492012-01-14 13:34:47 -05008983 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008984 return NULL;
8985 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008986 data = PyUnicode_DATA(unicode);
8987 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008988
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 /* Default to Latin-1 */
8990 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008991 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008993 /* allocate enough for a simple encoding without
8994 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008995 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008996 if (res == NULL)
8997 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008998 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009001 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02009002 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009004 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 if (x==enc_EXCEPTION) /* error */
9006 goto onError;
9007 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009008 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02009010 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 &res, &respos)) {
9012 goto onError;
9013 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009014 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 else
9016 /* done with this character => adjust input position */
9017 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009020 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00009021 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00009022 if (_PyBytes_Resize(&res, respos) < 0)
9023 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00009024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009025 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02009026 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009027 return res;
9028
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009030 Py_XDECREF(res);
9031 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02009032 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 return NULL;
9034}
9035
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009036/* Deprecated */
9037PyObject *
9038PyUnicode_EncodeCharmap(const Py_UNICODE *p,
9039 Py_ssize_t size,
9040 PyObject *mapping,
9041 const char *errors)
9042{
9043 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009044 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009045 if (unicode == NULL)
9046 return NULL;
9047 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
9048 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01009049 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009050}
9051
Alexander Belopolsky40018472011-02-26 01:02:56 +00009052PyObject *
9053PyUnicode_AsCharmapString(PyObject *unicode,
9054 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055{
9056 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 PyErr_BadArgument();
9058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009060 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061}
9062
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009063/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009064static void
9065make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009067 Py_ssize_t startpos, Py_ssize_t endpos,
9068 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009070 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 *exceptionObject = _PyUnicodeTranslateError_Create(
9072 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073 }
9074 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9076 goto onError;
9077 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9078 goto onError;
9079 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9080 goto onError;
9081 return;
9082 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02009083 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084 }
9085}
9086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009087/* error handling callback helper:
9088 build arguments, call the callback and check the arguments,
9089 put the result into newpos and return the replacement string, which
9090 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009091static PyObject *
9092unicode_translate_call_errorhandler(const char *errors,
9093 PyObject **errorHandler,
9094 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009096 Py_ssize_t startpos, Py_ssize_t endpos,
9097 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009098{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009099 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009100
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009101 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009102 PyObject *restuple;
9103 PyObject *resunicode;
9104
9105 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009107 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009108 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009109 }
9110
9111 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009113 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009115
Petr Viktorinffd97532020-02-11 17:46:57 +01009116 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009117 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009119 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009120 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 Py_DECREF(restuple);
9122 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009123 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009124 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 &resunicode, &i_newpos)) {
9126 Py_DECREF(restuple);
9127 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009128 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009129 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009131 else
9132 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009134 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 Py_DECREF(restuple);
9136 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009137 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009138 Py_INCREF(resunicode);
9139 Py_DECREF(restuple);
9140 return resunicode;
9141}
9142
9143/* Lookup the character ch in the mapping and put the result in result,
9144 which must be decrefed by the caller.
9145 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009146static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009148{
Christian Heimes217cfd12007-12-02 14:31:20 +00009149 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009150 PyObject *x;
9151
9152 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009154 x = PyObject_GetItem(mapping, w);
9155 Py_DECREF(w);
9156 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9158 /* No mapping found means: use 1:1 mapping. */
9159 PyErr_Clear();
9160 *result = NULL;
9161 return 0;
9162 } else
9163 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009164 }
9165 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 *result = x;
9167 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009168 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009169 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009170 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009171 if (value < 0 || value > MAX_UNICODE) {
9172 PyErr_Format(PyExc_ValueError,
9173 "character mapping must be in range(0x%x)",
9174 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 Py_DECREF(x);
9176 return -1;
9177 }
9178 *result = x;
9179 return 0;
9180 }
9181 else if (PyUnicode_Check(x)) {
9182 *result = x;
9183 return 0;
9184 }
9185 else {
9186 /* wrong return value */
9187 PyErr_SetString(PyExc_TypeError,
9188 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009189 Py_DECREF(x);
9190 return -1;
9191 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009192}
Victor Stinner1194ea02014-04-04 19:37:40 +02009193
9194/* lookup the character, write the result into the writer.
9195 Return 1 if the result was written into the writer, return 0 if the mapping
9196 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009197static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009198charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9199 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009200{
Victor Stinner1194ea02014-04-04 19:37:40 +02009201 PyObject *item;
9202
9203 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009205
9206 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009208 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009211 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009212 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009213
9214 if (item == Py_None) {
9215 Py_DECREF(item);
9216 return 0;
9217 }
9218
9219 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009220 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9221 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9222 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009223 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9224 Py_DECREF(item);
9225 return -1;
9226 }
9227 Py_DECREF(item);
9228 return 1;
9229 }
9230
9231 if (!PyUnicode_Check(item)) {
9232 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009233 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009234 }
9235
9236 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9237 Py_DECREF(item);
9238 return -1;
9239 }
9240
9241 Py_DECREF(item);
9242 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009243}
9244
Victor Stinner89a76ab2014-04-05 11:44:04 +02009245static int
9246unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9247 Py_UCS1 *translate)
9248{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009249 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009250 int ret = 0;
9251
Victor Stinner89a76ab2014-04-05 11:44:04 +02009252 if (charmaptranslate_lookup(ch, mapping, &item)) {
9253 return -1;
9254 }
9255
9256 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009257 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009258 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009259 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009260 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009261 /* not found => default to 1:1 mapping */
9262 translate[ch] = ch;
9263 return 1;
9264 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009265 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009266 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009267 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9268 used it */
9269 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009270 /* invalid character or character outside ASCII:
9271 skip the fast translate */
9272 goto exit;
9273 }
9274 translate[ch] = (Py_UCS1)replace;
9275 }
9276 else if (PyUnicode_Check(item)) {
9277 Py_UCS4 replace;
9278
9279 if (PyUnicode_READY(item) == -1) {
9280 Py_DECREF(item);
9281 return -1;
9282 }
9283 if (PyUnicode_GET_LENGTH(item) != 1)
9284 goto exit;
9285
9286 replace = PyUnicode_READ_CHAR(item, 0);
9287 if (replace > 127)
9288 goto exit;
9289 translate[ch] = (Py_UCS1)replace;
9290 }
9291 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009292 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009293 goto exit;
9294 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009295 ret = 1;
9296
Benjamin Peterson1365de72014-04-07 20:15:41 -04009297 exit:
9298 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009299 return ret;
9300}
9301
9302/* Fast path for ascii => ascii translation. Return 1 if the whole string
9303 was translated into writer, return 0 if the input string was partially
9304 translated into writer, raise an exception and return -1 on error. */
9305static int
9306unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009307 _PyUnicodeWriter *writer, int ignore,
9308 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009309{
Victor Stinner872b2912014-04-05 14:27:07 +02009310 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009311 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009312 const Py_UCS1 *in, *end;
9313 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009314 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009315
Victor Stinner89a76ab2014-04-05 11:44:04 +02009316 len = PyUnicode_GET_LENGTH(input);
9317
Victor Stinner872b2912014-04-05 14:27:07 +02009318 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009319
9320 in = PyUnicode_1BYTE_DATA(input);
9321 end = in + len;
9322
9323 assert(PyUnicode_IS_ASCII(writer->buffer));
9324 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9325 out = PyUnicode_1BYTE_DATA(writer->buffer);
9326
Victor Stinner872b2912014-04-05 14:27:07 +02009327 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009328 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009329 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009330 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009331 int translate = unicode_fast_translate_lookup(mapping, ch,
9332 ascii_table);
9333 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009334 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009335 if (translate == 0)
9336 goto exit;
9337 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009338 }
Victor Stinner872b2912014-04-05 14:27:07 +02009339 if (ch2 == 0xfe) {
9340 if (ignore)
9341 continue;
9342 goto exit;
9343 }
9344 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009345 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009346 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009347 }
Victor Stinner872b2912014-04-05 14:27:07 +02009348 res = 1;
9349
9350exit:
9351 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009352 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009353 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009354}
9355
Victor Stinner3222da22015-10-01 22:07:32 +02009356static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357_PyUnicode_TranslateCharmap(PyObject *input,
9358 PyObject *mapping,
9359 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009362 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 Py_ssize_t size, i;
9364 int kind;
9365 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009366 _PyUnicodeWriter writer;
9367 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009368 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009369 PyObject *errorHandler = NULL;
9370 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009371 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009372 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009373
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009375 PyErr_BadArgument();
9376 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 if (PyUnicode_READY(input) == -1)
9380 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009381 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 kind = PyUnicode_KIND(input);
9383 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009385 if (size == 0)
9386 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009388 /* allocate enough for a simple 1:1 translation without
9389 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009390 _PyUnicodeWriter_Init(&writer);
9391 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393
Victor Stinner872b2912014-04-05 14:27:07 +02009394 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9395
Victor Stinner33798672016-03-01 21:59:58 +01009396 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009397 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009398 if (PyUnicode_IS_ASCII(input)) {
9399 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9400 if (res < 0) {
9401 _PyUnicodeWriter_Dealloc(&writer);
9402 return NULL;
9403 }
9404 if (res == 1)
9405 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009406 }
Victor Stinner33798672016-03-01 21:59:58 +01009407 else {
9408 i = 0;
9409 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009412 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009413 int translate;
9414 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9415 Py_ssize_t newpos;
9416 /* startpos for collecting untranslatable chars */
9417 Py_ssize_t collstart;
9418 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009419 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420
Victor Stinner1194ea02014-04-04 19:37:40 +02009421 ch = PyUnicode_READ(kind, data, i);
9422 translate = charmaptranslate_output(ch, mapping, &writer);
9423 if (translate < 0)
9424 goto onError;
9425
9426 if (translate != 0) {
9427 /* it worked => adjust input pointer */
9428 ++i;
9429 continue;
9430 }
9431
9432 /* untranslatable character */
9433 collstart = i;
9434 collend = i+1;
9435
9436 /* find all untranslatable characters */
9437 while (collend < size) {
9438 PyObject *x;
9439 ch = PyUnicode_READ(kind, data, collend);
9440 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009441 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009442 Py_XDECREF(x);
9443 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009444 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009445 ++collend;
9446 }
9447
9448 if (ignore) {
9449 i = collend;
9450 }
9451 else {
9452 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9453 reason, input, &exc,
9454 collstart, collend, &newpos);
9455 if (repunicode == NULL)
9456 goto onError;
9457 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009459 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009460 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009461 Py_DECREF(repunicode);
9462 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009463 }
9464 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009465 Py_XDECREF(exc);
9466 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009467 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468
Benjamin Peterson29060642009-01-31 22:14:21 +00009469 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009470 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009471 Py_XDECREF(exc);
9472 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473 return NULL;
9474}
9475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476/* Deprecated. Use PyUnicode_Translate instead. */
9477PyObject *
9478PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9479 Py_ssize_t size,
9480 PyObject *mapping,
9481 const char *errors)
9482{
Christian Heimes5f520f42012-09-11 14:03:25 +02009483 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009484 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 if (!unicode)
9486 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009487 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9488 Py_DECREF(unicode);
9489 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490}
9491
Alexander Belopolsky40018472011-02-26 01:02:56 +00009492PyObject *
9493PyUnicode_Translate(PyObject *str,
9494 PyObject *mapping,
9495 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009497 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009498 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009499 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500}
Tim Petersced69f82003-09-16 20:30:58 +00009501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502PyObject *
9503_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9504{
9505 if (!PyUnicode_Check(unicode)) {
9506 PyErr_BadInternalCall();
9507 return NULL;
9508 }
9509 if (PyUnicode_READY(unicode) == -1)
9510 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009511 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 /* If the string is already ASCII, just return the same string */
9513 Py_INCREF(unicode);
9514 return unicode;
9515 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009516
9517 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9518 PyObject *result = PyUnicode_New(len, 127);
9519 if (result == NULL) {
9520 return NULL;
9521 }
9522
9523 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9524 int kind = PyUnicode_KIND(unicode);
9525 const void *data = PyUnicode_DATA(unicode);
9526 Py_ssize_t i;
9527 for (i = 0; i < len; ++i) {
9528 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9529 if (ch < 127) {
9530 out[i] = ch;
9531 }
9532 else if (Py_UNICODE_ISSPACE(ch)) {
9533 out[i] = ' ';
9534 }
9535 else {
9536 int decimal = Py_UNICODE_TODECIMAL(ch);
9537 if (decimal < 0) {
9538 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009539 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009540 _PyUnicode_LENGTH(result) = i + 1;
9541 break;
9542 }
9543 out[i] = '0' + decimal;
9544 }
9545 }
9546
INADA Naoki16dfca42018-07-14 12:06:43 +09009547 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009548 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549}
9550
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009551PyObject *
9552PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9553 Py_ssize_t length)
9554{
Victor Stinnerf0124502011-11-21 23:12:56 +01009555 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009556 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009557 Py_UCS4 maxchar;
9558 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009559 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009560
Victor Stinner99d7ad02012-02-22 13:37:39 +01009561 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009562 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009563 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009564 if (ch > 127) {
9565 int decimal = Py_UNICODE_TODECIMAL(ch);
9566 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009567 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009568 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009569 }
9570 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009571
9572 /* Copy to a new string */
9573 decimal = PyUnicode_New(length, maxchar);
9574 if (decimal == NULL)
9575 return decimal;
9576 kind = PyUnicode_KIND(decimal);
9577 data = PyUnicode_DATA(decimal);
9578 /* Iterate over code points */
9579 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009580 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009581 if (ch > 127) {
9582 int decimal = Py_UNICODE_TODECIMAL(ch);
9583 if (decimal >= 0)
9584 ch = '0' + decimal;
9585 }
9586 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009587 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009588 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009589}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009590/* --- Decimal Encoder ---------------------------------------------------- */
9591
Alexander Belopolsky40018472011-02-26 01:02:56 +00009592int
9593PyUnicode_EncodeDecimal(Py_UNICODE *s,
9594 Py_ssize_t length,
9595 char *output,
9596 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009597{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009598 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009599 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009600 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009601 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009602
9603 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 PyErr_BadArgument();
9605 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009606 }
9607
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009608 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009609 if (unicode == NULL)
9610 return -1;
9611
Victor Stinner42bf7752011-11-21 22:52:58 +01009612 kind = PyUnicode_KIND(unicode);
9613 data = PyUnicode_DATA(unicode);
9614
Victor Stinnerb84d7232011-11-22 01:50:07 +01009615 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009616 PyObject *exc;
9617 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009618 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009619 Py_ssize_t startpos;
9620
9621 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009622
Benjamin Peterson29060642009-01-31 22:14:21 +00009623 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009624 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009625 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009626 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009627 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 decimal = Py_UNICODE_TODECIMAL(ch);
9629 if (decimal >= 0) {
9630 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009631 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009632 continue;
9633 }
9634 if (0 < ch && ch < 256) {
9635 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009636 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009637 continue;
9638 }
Victor Stinner6345be92011-11-25 20:09:01 +01009639
Victor Stinner42bf7752011-11-21 22:52:58 +01009640 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009641 exc = NULL;
9642 raise_encode_exception(&exc, "decimal", unicode,
9643 startpos, startpos+1,
9644 "invalid decimal Unicode string");
9645 Py_XDECREF(exc);
9646 Py_DECREF(unicode);
9647 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009648 }
9649 /* 0-terminate the output string */
9650 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009651 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009652 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009653}
9654
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655/* --- Helpers ------------------------------------------------------------ */
9656
/* Helper macro to fix up start/end slice values in place.

   `end` is capped at `len`; a negative `end` or `start` is offset by `len`
   and then floored at 0.  `start` is not capped at `len`.  `start` and
   `end` must be modifiable lvalues; `len` may be evaluated several times.

   The body is wrapped in do { } while (0) so that the macro behaves like a
   single statement (safe inside an unbraced if/else) and must be followed
   by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009673any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009675 Py_ssize_t end,
9676 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009678 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009679 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 Py_ssize_t len1, len2, result;
9681
9682 kind1 = PyUnicode_KIND(s1);
9683 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009684 if (kind1 < kind2)
9685 return -1;
9686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 len1 = PyUnicode_GET_LENGTH(s1);
9688 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009689 ADJUST_INDICES(start, end, len1);
9690 if (end - start < len2)
9691 return -1;
9692
9693 buf1 = PyUnicode_DATA(s1);
9694 buf2 = PyUnicode_DATA(s2);
9695 if (len2 == 1) {
9696 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9697 result = findchar((const char *)buf1 + kind1*start,
9698 kind1, end - start, ch, direction);
9699 if (result == -1)
9700 return -1;
9701 else
9702 return start + result;
9703 }
9704
9705 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009706 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009707 if (!buf2)
9708 return -2;
9709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710
Victor Stinner794d5672011-10-10 03:21:36 +02009711 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009712 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009713 case PyUnicode_1BYTE_KIND:
9714 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9715 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9716 else
9717 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9718 break;
9719 case PyUnicode_2BYTE_KIND:
9720 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9721 break;
9722 case PyUnicode_4BYTE_KIND:
9723 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9724 break;
9725 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009726 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009727 }
9728 }
9729 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009730 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009731 case PyUnicode_1BYTE_KIND:
9732 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9733 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9734 else
9735 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9736 break;
9737 case PyUnicode_2BYTE_KIND:
9738 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9739 break;
9740 case PyUnicode_4BYTE_KIND:
9741 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9742 break;
9743 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009744 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746 }
9747
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009748 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009749 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009750 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751
9752 return result;
9753}
9754
Victor Stinner59423e32018-11-26 13:40:01 +01009755/* _PyUnicode_InsertThousandsGrouping() helper functions */
9756#include "stringlib/localeutil.h"
9757
9758/**
9759 * InsertThousandsGrouping:
9760 * @writer: Unicode writer.
9761 * @n_buffer: Number of characters in @buffer.
9762 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9763 * @d_pos: Start of digits string.
9764 * @n_digits: The number of digits in the string, in which we want
9765 * to put the grouping chars.
9766 * @min_width: The minimum width of the digits in the output string.
9767 * Output will be zero-padded on the left to fill.
9768 * @grouping: see definition in localeconv().
9769 * @thousands_sep: see definition in localeconv().
9770 *
9771 * There are 2 modes: counting and filling. If @writer is NULL,
9772 * we are in counting mode, else filling mode.
9773 * If counting, the required buffer size is returned.
9774 * If filling, we know the buffer will be large enough, so we don't
9775 * need to pass in the buffer size.
9776 * Inserts thousand grouping characters (as defined by grouping and
9777 * thousands_sep) into @writer.
9778 *
9779 * Return value: -1 on error, number of characters otherwise.
9780 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009782_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009783 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009784 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009785 PyObject *digits,
9786 Py_ssize_t d_pos,
9787 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009788 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009789 const char *grouping,
9790 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009791 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792{
Xtreak3f7983a2019-01-07 20:39:14 +05309793 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009794 if (writer) {
9795 assert(digits != NULL);
9796 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009797 }
9798 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009799 assert(digits == NULL);
9800 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009801 }
Victor Stinner59423e32018-11-26 13:40:01 +01009802 assert(0 <= d_pos);
9803 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009804 assert(grouping != NULL);
9805
9806 if (digits != NULL) {
9807 if (PyUnicode_READY(digits) == -1) {
9808 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009809 }
Victor Stinner59423e32018-11-26 13:40:01 +01009810 }
9811 if (PyUnicode_READY(thousands_sep) == -1) {
9812 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009813 }
9814
Victor Stinner59423e32018-11-26 13:40:01 +01009815 Py_ssize_t count = 0;
9816 Py_ssize_t n_zeros;
9817 int loop_broken = 0;
9818 int use_separator = 0; /* First time through, don't append the
9819 separator. They only go between
9820 groups. */
9821 Py_ssize_t buffer_pos;
9822 Py_ssize_t digits_pos;
9823 Py_ssize_t len;
9824 Py_ssize_t n_chars;
9825 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9826 be looked at */
9827 /* A generator that returns all of the grouping widths, until it
9828 returns 0. */
9829 GroupGenerator groupgen;
9830 GroupGenerator_init(&groupgen, grouping);
9831 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9832
9833 /* if digits are not grouped, thousands separator
9834 should be an empty string */
9835 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9836
9837 digits_pos = d_pos + n_digits;
9838 if (writer) {
9839 buffer_pos = writer->pos + n_buffer;
9840 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9841 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 }
Victor Stinner59423e32018-11-26 13:40:01 +01009843 else {
9844 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009845 }
Victor Stinner59423e32018-11-26 13:40:01 +01009846
9847 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009848 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009849 }
Victor Stinner59423e32018-11-26 13:40:01 +01009850
9851 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9852 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9853 n_zeros = Py_MAX(0, len - remaining);
9854 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9855
9856 /* Use n_zero zero's and n_chars chars */
9857
9858 /* Count only, don't do anything. */
9859 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9860
9861 /* Copy into the writer. */
9862 InsertThousandsGrouping_fill(writer, &buffer_pos,
9863 digits, &digits_pos,
9864 n_chars, n_zeros,
9865 use_separator ? thousands_sep : NULL,
9866 thousands_sep_len, maxchar);
9867
9868 /* Use a separator next time. */
9869 use_separator = 1;
9870
9871 remaining -= n_chars;
9872 min_width -= len;
9873
9874 if (remaining <= 0 && min_width <= 0) {
9875 loop_broken = 1;
9876 break;
9877 }
9878 min_width -= thousands_sep_len;
9879 }
9880 if (!loop_broken) {
9881 /* We left the loop without using a break statement. */
9882
9883 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9884 n_zeros = Py_MAX(0, len - remaining);
9885 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9886
9887 /* Use n_zero zero's and n_chars chars */
9888 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9889
9890 /* Copy into the writer. */
9891 InsertThousandsGrouping_fill(writer, &buffer_pos,
9892 digits, &digits_pos,
9893 n_chars, n_zeros,
9894 use_separator ? thousands_sep : NULL,
9895 thousands_sep_len, maxchar);
9896 }
9897 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898}
9899
9900
Alexander Belopolsky40018472011-02-26 01:02:56 +00009901Py_ssize_t
9902PyUnicode_Count(PyObject *str,
9903 PyObject *substr,
9904 Py_ssize_t start,
9905 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009907 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009908 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009909 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009911
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009912 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009914
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009915 kind1 = PyUnicode_KIND(str);
9916 kind2 = PyUnicode_KIND(substr);
9917 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009918 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009919
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009920 len1 = PyUnicode_GET_LENGTH(str);
9921 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009923 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009924 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009925
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009926 buf1 = PyUnicode_DATA(str);
9927 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009928 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009929 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009930 if (!buf2)
9931 goto onError;
9932 }
9933
9934 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009936 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009937 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009938 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009939 buf2, len2, PY_SSIZE_T_MAX
9940 );
9941 else
9942 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009943 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009944 buf2, len2, PY_SSIZE_T_MAX
9945 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 break;
9947 case PyUnicode_2BYTE_KIND:
9948 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009949 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 buf2, len2, PY_SSIZE_T_MAX
9951 );
9952 break;
9953 case PyUnicode_4BYTE_KIND:
9954 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009955 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 buf2, len2, PY_SSIZE_T_MAX
9957 );
9958 break;
9959 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009960 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009962
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009963 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009964 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009965 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009969 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9970 if (kind2 != kind1)
9971 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973}
9974
Alexander Belopolsky40018472011-02-26 01:02:56 +00009975Py_ssize_t
9976PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009977 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009978 Py_ssize_t start,
9979 Py_ssize_t end,
9980 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009982 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009984
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009985 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986}
9987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988Py_ssize_t
9989PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9990 Py_ssize_t start, Py_ssize_t end,
9991 int direction)
9992{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009994 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 if (PyUnicode_READY(str) == -1)
9996 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009997 len = PyUnicode_GET_LENGTH(str);
9998 ADJUST_INDICES(start, end, len);
9999 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010000 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010002 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
10003 kind, end-start, ch, direction);
10004 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010006 else
10007 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008}
10009
Alexander Belopolsky40018472011-02-26 01:02:56 +000010010static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010011tailmatch(PyObject *self,
10012 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010013 Py_ssize_t start,
10014 Py_ssize_t end,
10015 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 int kind_self;
10018 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010019 const void *data_self;
10020 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 Py_ssize_t offset;
10022 Py_ssize_t i;
10023 Py_ssize_t end_sub;
10024
10025 if (PyUnicode_READY(self) == -1 ||
10026 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +010010027 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
10030 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +000010032 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +030010034 if (PyUnicode_GET_LENGTH(substring) == 0)
10035 return 1;
10036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 kind_self = PyUnicode_KIND(self);
10038 data_self = PyUnicode_DATA(self);
10039 kind_sub = PyUnicode_KIND(substring);
10040 data_sub = PyUnicode_DATA(substring);
10041 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
10042
10043 if (direction > 0)
10044 offset = end;
10045 else
10046 offset = start;
10047
10048 if (PyUnicode_READ(kind_self, data_self, offset) ==
10049 PyUnicode_READ(kind_sub, data_sub, 0) &&
10050 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
10051 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
10052 /* If both are of the same kind, memcmp is sufficient */
10053 if (kind_self == kind_sub) {
10054 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010055 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 data_sub,
10057 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010058 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 }
Martin Pantere26da7c2016-06-02 10:07:09 +000010060 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 else {
10062 /* We do not need to compare 0 and len(substring)-1 because
10063 the if statement above ensured already that they are equal
10064 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 for (i = 1; i < end_sub; ++i) {
10066 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10067 PyUnicode_READ(kind_sub, data_sub, i))
10068 return 0;
10069 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010070 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010072 }
10073
10074 return 0;
10075}
10076
Alexander Belopolsky40018472011-02-26 01:02:56 +000010077Py_ssize_t
10078PyUnicode_Tailmatch(PyObject *str,
10079 PyObject *substr,
10080 Py_ssize_t start,
10081 Py_ssize_t end,
10082 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010084 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010085 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010086
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010087 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088}
10089
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010090static PyObject *
10091ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010092{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010093 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010094 const char *data = PyUnicode_DATA(self);
10095 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010096 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010097
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010098 res = PyUnicode_New(len, 127);
10099 if (res == NULL)
10100 return NULL;
10101 resdata = PyUnicode_DATA(res);
10102 if (lower)
10103 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010105 _Py_bytes_upper(resdata, data, len);
10106 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107}
10108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010110handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010112 Py_ssize_t j;
10113 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010114 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010115 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010116
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010117 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10118
10119 where ! is a negation and \p{xxx} is a character with property xxx.
10120 */
10121 for (j = i - 1; j >= 0; j--) {
10122 c = PyUnicode_READ(kind, data, j);
10123 if (!_PyUnicode_IsCaseIgnorable(c))
10124 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010126 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10127 if (final_sigma) {
10128 for (j = i + 1; j < length; j++) {
10129 c = PyUnicode_READ(kind, data, j);
10130 if (!_PyUnicode_IsCaseIgnorable(c))
10131 break;
10132 }
10133 final_sigma = j == length || !_PyUnicode_IsCased(c);
10134 }
10135 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136}
10137
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010138static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010139lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010140 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010142 /* Obscure special case. */
10143 if (c == 0x3A3) {
10144 mapped[0] = handle_capital_sigma(kind, data, length, i);
10145 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010147 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148}
10149
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010150static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010151do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010153 Py_ssize_t i, k = 0;
10154 int n_res, j;
10155 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010156
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010157 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010158 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010159 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010160 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010161 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010163 for (i = 1; i < length; i++) {
10164 c = PyUnicode_READ(kind, data, i);
10165 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10166 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010167 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010168 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010169 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010170 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010171 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172}
10173
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010174static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010175do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010176 Py_ssize_t i, k = 0;
10177
10178 for (i = 0; i < length; i++) {
10179 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10180 int n_res, j;
10181 if (Py_UNICODE_ISUPPER(c)) {
10182 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10183 }
10184 else if (Py_UNICODE_ISLOWER(c)) {
10185 n_res = _PyUnicode_ToUpperFull(c, mapped);
10186 }
10187 else {
10188 n_res = 1;
10189 mapped[0] = c;
10190 }
10191 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010192 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010193 res[k++] = mapped[j];
10194 }
10195 }
10196 return k;
10197}
10198
10199static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010200do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010201 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010203 Py_ssize_t i, k = 0;
10204
10205 for (i = 0; i < length; i++) {
10206 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10207 int n_res, j;
10208 if (lower)
10209 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10210 else
10211 n_res = _PyUnicode_ToUpperFull(c, mapped);
10212 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010213 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010214 res[k++] = mapped[j];
10215 }
10216 }
10217 return k;
10218}
10219
10220static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010221do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010222{
10223 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10224}
10225
10226static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010227do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010228{
10229 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10230}
10231
Benjamin Petersone51757f2012-01-12 21:10:29 -050010232static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010233do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010234{
10235 Py_ssize_t i, k = 0;
10236
10237 for (i = 0; i < length; i++) {
10238 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10239 Py_UCS4 mapped[3];
10240 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10241 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010242 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010243 res[k++] = mapped[j];
10244 }
10245 }
10246 return k;
10247}
10248
10249static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010250do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010251{
10252 Py_ssize_t i, k = 0;
10253 int previous_is_cased;
10254
10255 previous_is_cased = 0;
10256 for (i = 0; i < length; i++) {
10257 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10258 Py_UCS4 mapped[3];
10259 int n_res, j;
10260
10261 if (previous_is_cased)
10262 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10263 else
10264 n_res = _PyUnicode_ToTitleFull(c, mapped);
10265
10266 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010267 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010268 res[k++] = mapped[j];
10269 }
10270
10271 previous_is_cased = _PyUnicode_IsCased(c);
10272 }
10273 return k;
10274}
10275
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010276static PyObject *
10277case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010278 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010279{
10280 PyObject *res = NULL;
10281 Py_ssize_t length, newlength = 0;
10282 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010283 const void *data;
10284 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010285 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10286
Benjamin Petersoneea48462012-01-16 14:28:50 -050010287 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010288
10289 kind = PyUnicode_KIND(self);
10290 data = PyUnicode_DATA(self);
10291 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010292 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010293 PyErr_SetString(PyExc_OverflowError, "string is too long");
10294 return NULL;
10295 }
Victor Stinner00d7abd2020-12-01 09:56:42 +010010296 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010297 if (tmp == NULL)
10298 return PyErr_NoMemory();
10299 newlength = perform(kind, data, length, tmp, &maxchar);
10300 res = PyUnicode_New(newlength, maxchar);
10301 if (res == NULL)
10302 goto leave;
10303 tmpend = tmp + newlength;
10304 outdata = PyUnicode_DATA(res);
10305 outkind = PyUnicode_KIND(res);
10306 switch (outkind) {
10307 case PyUnicode_1BYTE_KIND:
10308 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10309 break;
10310 case PyUnicode_2BYTE_KIND:
10311 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10312 break;
10313 case PyUnicode_4BYTE_KIND:
10314 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10315 break;
10316 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010317 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010318 }
10319 leave:
Victor Stinner00d7abd2020-12-01 09:56:42 +010010320 PyMem_Free(tmp);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010321 return res;
10322}
10323
Tim Peters8ce9f162004-08-27 01:49:32 +000010324PyObject *
10325PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010327 PyObject *res;
10328 PyObject *fseq;
10329 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010330 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010332 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010333 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010334 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010335 }
10336
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010337 /* NOTE: the following code can't call back into Python code,
10338 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010339 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010340
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010341 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010342 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010343 res = _PyUnicode_JoinArray(separator, items, seqlen);
10344 Py_DECREF(fseq);
10345 return res;
10346}
10347
10348PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010349_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010350{
10351 PyObject *res = NULL; /* the result */
10352 PyObject *sep = NULL;
10353 Py_ssize_t seplen;
10354 PyObject *item;
10355 Py_ssize_t sz, i, res_offset;
10356 Py_UCS4 maxchar;
10357 Py_UCS4 item_maxchar;
10358 int use_memcpy;
10359 unsigned char *res_data = NULL, *sep_data = NULL;
10360 PyObject *last_obj;
10361 unsigned int kind = 0;
10362
Tim Peters05eba1f2004-08-27 21:32:02 +000010363 /* If empty sequence, return u"". */
10364 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010365 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010366 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010367
Tim Peters05eba1f2004-08-27 21:32:02 +000010368 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010369 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010370 if (seqlen == 1) {
10371 if (PyUnicode_CheckExact(items[0])) {
10372 res = items[0];
10373 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010374 return res;
10375 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010376 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010377 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010378 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010379 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010380 /* Set up sep and seplen */
10381 if (separator == NULL) {
10382 /* fall back to a blank space separator */
10383 sep = PyUnicode_FromOrdinal(' ');
10384 if (!sep)
10385 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010386 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010387 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010388 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010389 else {
10390 if (!PyUnicode_Check(separator)) {
10391 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010392 "separator: expected str instance,"
10393 " %.80s found",
10394 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010395 goto onError;
10396 }
10397 if (PyUnicode_READY(separator))
10398 goto onError;
10399 sep = separator;
10400 seplen = PyUnicode_GET_LENGTH(separator);
10401 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10402 /* inc refcount to keep this code path symmetric with the
10403 above case of a blank separator */
10404 Py_INCREF(sep);
10405 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010406 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010407 }
10408
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010409 /* There are at least two things to join, or else we have a subclass
10410 * of str in the sequence.
10411 * Do a pre-pass to figure out the total amount of space we'll
10412 * need (sz), and see whether all argument are strings.
10413 */
10414 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010415#ifdef Py_DEBUG
10416 use_memcpy = 0;
10417#else
10418 use_memcpy = 1;
10419#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010420 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010421 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010422 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010423 if (!PyUnicode_Check(item)) {
10424 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010425 "sequence item %zd: expected str instance,"
10426 " %.80s found",
10427 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010428 goto onError;
10429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 if (PyUnicode_READY(item) == -1)
10431 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010432 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010434 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010435 if (i != 0) {
10436 add_sz += seplen;
10437 }
10438 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010439 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010440 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010441 goto onError;
10442 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010443 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010444 if (use_memcpy && last_obj != NULL) {
10445 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10446 use_memcpy = 0;
10447 }
10448 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010449 }
Tim Petersced69f82003-09-16 20:30:58 +000010450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010452 if (res == NULL)
10453 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010454
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010455 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010456#ifdef Py_DEBUG
10457 use_memcpy = 0;
10458#else
10459 if (use_memcpy) {
10460 res_data = PyUnicode_1BYTE_DATA(res);
10461 kind = PyUnicode_KIND(res);
10462 if (seplen != 0)
10463 sep_data = PyUnicode_1BYTE_DATA(sep);
10464 }
10465#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010466 if (use_memcpy) {
10467 for (i = 0; i < seqlen; ++i) {
10468 Py_ssize_t itemlen;
10469 item = items[i];
10470
10471 /* Copy item, and maybe the separator. */
10472 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010473 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010474 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 kind * seplen);
10476 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010477 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010478
10479 itemlen = PyUnicode_GET_LENGTH(item);
10480 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010481 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010482 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010483 kind * itemlen);
10484 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010485 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010486 }
10487 assert(res_data == PyUnicode_1BYTE_DATA(res)
10488 + kind * PyUnicode_GET_LENGTH(res));
10489 }
10490 else {
10491 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10492 Py_ssize_t itemlen;
10493 item = items[i];
10494
10495 /* Copy item, and maybe the separator. */
10496 if (i && seplen != 0) {
10497 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10498 res_offset += seplen;
10499 }
10500
10501 itemlen = PyUnicode_GET_LENGTH(item);
10502 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010503 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010504 res_offset += itemlen;
10505 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010506 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010507 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010508 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010511 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513
Benjamin Peterson29060642009-01-31 22:14:21 +000010514 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010516 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517 return NULL;
10518}
10519
Victor Stinnerd3f08822012-05-29 12:57:52 +020010520void
10521_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10522 Py_UCS4 fill_char)
10523{
10524 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010525 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010526 assert(PyUnicode_IS_READY(unicode));
10527 assert(unicode_modifiable(unicode));
10528 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10529 assert(start >= 0);
10530 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010531 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010532}
10533
Victor Stinner3fe55312012-01-04 00:33:50 +010010534Py_ssize_t
10535PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10536 Py_UCS4 fill_char)
10537{
10538 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010539
10540 if (!PyUnicode_Check(unicode)) {
10541 PyErr_BadInternalCall();
10542 return -1;
10543 }
10544 if (PyUnicode_READY(unicode) == -1)
10545 return -1;
10546 if (unicode_check_modifiable(unicode))
10547 return -1;
10548
Victor Stinnerd3f08822012-05-29 12:57:52 +020010549 if (start < 0) {
10550 PyErr_SetString(PyExc_IndexError, "string index out of range");
10551 return -1;
10552 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010553 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10554 PyErr_SetString(PyExc_ValueError,
10555 "fill character is bigger than "
10556 "the string maximum character");
10557 return -1;
10558 }
10559
10560 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10561 length = Py_MIN(maxlen, length);
10562 if (length <= 0)
10563 return 0;
10564
Victor Stinnerd3f08822012-05-29 12:57:52 +020010565 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010566 return length;
10567}
10568
Victor Stinner9310abb2011-10-05 00:59:23 +020010569static PyObject *
10570pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010571 Py_ssize_t left,
10572 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 PyObject *u;
10576 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010577 int kind;
10578 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579
10580 if (left < 0)
10581 left = 0;
10582 if (right < 0)
10583 right = 0;
10584
Victor Stinnerc4b49542011-12-11 22:44:26 +010010585 if (left == 0 && right == 0)
10586 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10589 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010590 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10591 return NULL;
10592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010594 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010596 if (!u)
10597 return NULL;
10598
10599 kind = PyUnicode_KIND(u);
10600 data = PyUnicode_DATA(u);
10601 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010602 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010603 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010604 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010605 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010606 assert(_PyUnicode_CheckConsistency(u, 1));
10607 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608}
10609
Alexander Belopolsky40018472011-02-26 01:02:56 +000010610PyObject *
10611PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010615 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617
Benjamin Petersonead6b532011-12-20 17:23:42 -060010618 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010620 if (PyUnicode_IS_ASCII(string))
10621 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010622 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010623 PyUnicode_GET_LENGTH(string), keepends);
10624 else
10625 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010626 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010627 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 break;
10629 case PyUnicode_2BYTE_KIND:
10630 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010631 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 PyUnicode_GET_LENGTH(string), keepends);
10633 break;
10634 case PyUnicode_4BYTE_KIND:
10635 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010636 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 PyUnicode_GET_LENGTH(string), keepends);
10638 break;
10639 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010640 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643}
10644
Alexander Belopolsky40018472011-02-26 01:02:56 +000010645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010646split(PyObject *self,
10647 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010648 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010650 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010651 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 Py_ssize_t len1, len2;
10653 PyObject* out;
10654
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010656 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (PyUnicode_READY(self) == -1)
10659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010662 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010664 if (PyUnicode_IS_ASCII(self))
10665 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010666 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010667 PyUnicode_GET_LENGTH(self), maxcount
10668 );
10669 else
10670 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010671 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010672 PyUnicode_GET_LENGTH(self), maxcount
10673 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 case PyUnicode_2BYTE_KIND:
10675 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010676 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 PyUnicode_GET_LENGTH(self), maxcount
10678 );
10679 case PyUnicode_4BYTE_KIND:
10680 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010681 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 PyUnicode_GET_LENGTH(self), maxcount
10683 );
10684 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010685 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 }
10687
10688 if (PyUnicode_READY(substring) == -1)
10689 return NULL;
10690
10691 kind1 = PyUnicode_KIND(self);
10692 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 len1 = PyUnicode_GET_LENGTH(self);
10694 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010695 if (kind1 < kind2 || len1 < len2) {
10696 out = PyList_New(1);
10697 if (out == NULL)
10698 return NULL;
10699 Py_INCREF(self);
10700 PyList_SET_ITEM(out, 0, self);
10701 return out;
10702 }
10703 buf1 = PyUnicode_DATA(self);
10704 buf2 = PyUnicode_DATA(substring);
10705 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010706 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010707 if (!buf2)
10708 return NULL;
10709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010711 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010713 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10714 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010715 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010716 else
10717 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010718 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 break;
10720 case PyUnicode_2BYTE_KIND:
10721 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010722 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 break;
10724 case PyUnicode_4BYTE_KIND:
10725 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010726 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 break;
10728 default:
10729 out = NULL;
10730 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010731 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010732 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010733 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735}
10736
Alexander Belopolsky40018472011-02-26 01:02:56 +000010737static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010738rsplit(PyObject *self,
10739 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010740 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010741{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010742 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010743 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 Py_ssize_t len1, len2;
10745 PyObject* out;
10746
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010747 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010748 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 if (PyUnicode_READY(self) == -1)
10751 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010754 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010756 if (PyUnicode_IS_ASCII(self))
10757 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010758 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010759 PyUnicode_GET_LENGTH(self), maxcount
10760 );
10761 else
10762 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010763 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010764 PyUnicode_GET_LENGTH(self), maxcount
10765 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 case PyUnicode_2BYTE_KIND:
10767 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010768 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 PyUnicode_GET_LENGTH(self), maxcount
10770 );
10771 case PyUnicode_4BYTE_KIND:
10772 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010773 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 PyUnicode_GET_LENGTH(self), maxcount
10775 );
10776 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010777 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 }
10779
10780 if (PyUnicode_READY(substring) == -1)
10781 return NULL;
10782
10783 kind1 = PyUnicode_KIND(self);
10784 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 len1 = PyUnicode_GET_LENGTH(self);
10786 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010787 if (kind1 < kind2 || len1 < len2) {
10788 out = PyList_New(1);
10789 if (out == NULL)
10790 return NULL;
10791 Py_INCREF(self);
10792 PyList_SET_ITEM(out, 0, self);
10793 return out;
10794 }
10795 buf1 = PyUnicode_DATA(self);
10796 buf2 = PyUnicode_DATA(substring);
10797 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010798 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010799 if (!buf2)
10800 return NULL;
10801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010803 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010805 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10806 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010807 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010808 else
10809 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010810 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 break;
10812 case PyUnicode_2BYTE_KIND:
10813 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010814 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 break;
10816 case PyUnicode_4BYTE_KIND:
10817 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010818 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 break;
10820 default:
10821 out = NULL;
10822 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010823 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010824 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010825 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 return out;
10827}
10828
10829static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010830anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10831 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010833 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010835 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10836 return asciilib_find(buf1, len1, buf2, len2, offset);
10837 else
10838 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 case PyUnicode_2BYTE_KIND:
10840 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10841 case PyUnicode_4BYTE_KIND:
10842 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10843 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010844 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845}
10846
10847static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010848anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10849 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010851 switch (kind) {
10852 case PyUnicode_1BYTE_KIND:
10853 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10854 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10855 else
10856 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10857 case PyUnicode_2BYTE_KIND:
10858 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10859 case PyUnicode_4BYTE_KIND:
10860 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10861 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010862 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010863}
10864
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010865static void
10866replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10867 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10868{
10869 int kind = PyUnicode_KIND(u);
10870 void *data = PyUnicode_DATA(u);
10871 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10872 if (kind == PyUnicode_1BYTE_KIND) {
10873 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10874 (Py_UCS1 *)data + len,
10875 u1, u2, maxcount);
10876 }
10877 else if (kind == PyUnicode_2BYTE_KIND) {
10878 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10879 (Py_UCS2 *)data + len,
10880 u1, u2, maxcount);
10881 }
10882 else {
10883 assert(kind == PyUnicode_4BYTE_KIND);
10884 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10885 (Py_UCS4 *)data + len,
10886 u1, u2, maxcount);
10887 }
10888}
10889
Alexander Belopolsky40018472011-02-26 01:02:56 +000010890static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891replace(PyObject *self, PyObject *str1,
10892 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010895 const char *sbuf = PyUnicode_DATA(self);
10896 const void *buf1 = PyUnicode_DATA(str1);
10897 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 int srelease = 0, release1 = 0, release2 = 0;
10899 int skind = PyUnicode_KIND(self);
10900 int kind1 = PyUnicode_KIND(str1);
10901 int kind2 = PyUnicode_KIND(str2);
10902 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10903 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10904 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010905 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010906 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010908 if (slen < len1)
10909 goto nothing;
10910
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010912 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010913 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010914 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
Victor Stinner59de0ee2011-10-07 10:01:28 +020010916 if (str1 == str2)
10917 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918
Victor Stinner49a0a212011-10-12 23:46:10 +020010919 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010920 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10921 if (maxchar < maxchar_str1)
10922 /* substring too wide to be present */
10923 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010924 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10925 /* Replacing str1 with str2 may cause a maxchar reduction in the
10926 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010927 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010928 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010931 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010933 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010935 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010936 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010937 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010938
Victor Stinner69ed0f42013-04-09 21:48:24 +020010939 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010940 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010941 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010942 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010943 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010945 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010947
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010948 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10949 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010950 }
10951 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 int rkind = skind;
10953 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010954 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 if (kind1 < rkind) {
10957 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010958 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 if (!buf1) goto error;
10960 release1 = 1;
10961 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010962 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010963 if (i < 0)
10964 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 if (rkind > kind2) {
10966 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010967 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 if (!buf2) goto error;
10969 release2 = 1;
10970 }
10971 else if (rkind < kind2) {
10972 /* widen self and buf1 */
10973 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010974 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010975 assert(buf1 != PyUnicode_DATA(str1));
10976 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010977 buf1 = PyUnicode_DATA(str1);
10978 release1 = 0;
10979 }
10980 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 if (!sbuf) goto error;
10982 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010983 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 if (!buf1) goto error;
10985 release1 = 1;
10986 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010987 u = PyUnicode_New(slen, maxchar);
10988 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010990 assert(PyUnicode_KIND(u) == rkind);
10991 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010992
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010993 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010994 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010995 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010997 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010999
11000 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020011001 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011002 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020011003 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011004 if (i == -1)
11005 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011006 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011008 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011012 }
11013 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010011015 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 int rkind = skind;
11017 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020011020 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011021 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 if (!buf1) goto error;
11023 release1 = 1;
11024 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020011025 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011026 if (n == 0)
11027 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020011029 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011030 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 if (!buf2) goto error;
11032 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020011035 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011037 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 if (!sbuf) goto error;
11039 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011040 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011041 assert(buf1 != PyUnicode_DATA(str1));
11042 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011043 buf1 = PyUnicode_DATA(str1);
11044 release1 = 0;
11045 }
11046 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 if (!buf1) goto error;
11048 release1 = 1;
11049 }
11050 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
Łukasz Langa8c1e1da2021-09-22 01:33:59 +020011051 PyUnicode_GET_LENGTH(str1)); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011052 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 PyErr_SetString(PyExc_OverflowError,
11054 "replace string is too long");
11055 goto error;
11056 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010011057 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020011058 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020011059 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020011060 goto done;
11061 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080011062 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 PyErr_SetString(PyExc_OverflowError,
11064 "replace string is too long");
11065 goto error;
11066 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011067 u = PyUnicode_New(new_size, maxchar);
11068 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020011070 assert(PyUnicode_KIND(u) == rkind);
11071 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 ires = i = 0;
11073 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011074 while (n-- > 0) {
11075 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020011076 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011077 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020011078 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011079 if (j == -1)
11080 break;
11081 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011082 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011083 memcpy(res + rkind * ires,
11084 sbuf + rkind * i,
11085 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011087 }
11088 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011090 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011092 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011098 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011099 memcpy(res + rkind * ires,
11100 sbuf + rkind * i,
11101 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011102 }
11103 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011104 /* interleave */
11105 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011106 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011108 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011110 if (--n <= 0)
11111 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011112 memcpy(res + rkind * ires,
11113 sbuf + rkind * i,
11114 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 ires++;
11116 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011118 memcpy(res + rkind * ires,
11119 sbuf + rkind * i,
11120 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011121 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011122 }
11123
11124 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011125 unicode_adjust_maxchar(&u);
11126 if (u == NULL)
11127 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011129
11130 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011131 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11132 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11133 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011135 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011137 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011139 PyMem_Free((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011140 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011142
Benjamin Peterson29060642009-01-31 22:14:21 +000011143 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011144 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011145 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11146 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11147 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011149 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011151 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011153 PyMem_Free((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011154 return unicode_result_unchanged(self);
11155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011157 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11158 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11159 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11160 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011161 PyMem_Free((void *)sbuf);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011162 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011163 PyMem_Free((void *)buf1);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011164 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011165 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167}
11168
11169/* --- Unicode Object Methods --------------------------------------------- */
11170
INADA Naoki3ae20562017-01-16 20:41:20 +090011171/*[clinic input]
11172str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173
INADA Naoki3ae20562017-01-16 20:41:20 +090011174Return a version of the string where each word is titlecased.
11175
11176More specifically, words start with uppercased characters and all remaining
11177cased characters have lower case.
11178[clinic start generated code]*/
11179
11180static PyObject *
11181unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011182/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011184 if (PyUnicode_READY(self) == -1)
11185 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011186 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187}
11188
INADA Naoki3ae20562017-01-16 20:41:20 +090011189/*[clinic input]
11190str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191
INADA Naoki3ae20562017-01-16 20:41:20 +090011192Return a capitalized version of the string.
11193
11194More specifically, make the first character have upper case and the rest lower
11195case.
11196[clinic start generated code]*/
11197
11198static PyObject *
11199unicode_capitalize_impl(PyObject *self)
11200/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011202 if (PyUnicode_READY(self) == -1)
11203 return NULL;
11204 if (PyUnicode_GET_LENGTH(self) == 0)
11205 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011206 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207}
11208
INADA Naoki3ae20562017-01-16 20:41:20 +090011209/*[clinic input]
11210str.casefold as unicode_casefold
11211
11212Return a version of the string suitable for caseless comparisons.
11213[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011214
11215static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011216unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011217/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011218{
11219 if (PyUnicode_READY(self) == -1)
11220 return NULL;
11221 if (PyUnicode_IS_ASCII(self))
11222 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011223 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011224}
11225
11226
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011227/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011228
11229static int
11230convert_uc(PyObject *obj, void *addr)
11231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011233
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011234 if (!PyUnicode_Check(obj)) {
11235 PyErr_Format(PyExc_TypeError,
11236 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011237 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011238 return 0;
11239 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011240 if (PyUnicode_READY(obj) < 0)
11241 return 0;
11242 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011243 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011245 return 0;
11246 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011247 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011248 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011249}
11250
INADA Naoki3ae20562017-01-16 20:41:20 +090011251/*[clinic input]
11252str.center as unicode_center
11253
11254 width: Py_ssize_t
11255 fillchar: Py_UCS4 = ' '
11256 /
11257
11258Return a centered string of length width.
11259
11260Padding is done using the specified fill character (default is a space).
11261[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
11263static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011264unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11265/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011267 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268
Benjamin Petersonbac79492012-01-14 13:34:47 -050011269 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270 return NULL;
11271
Victor Stinnerc4b49542011-12-11 22:44:26 +010011272 if (PyUnicode_GET_LENGTH(self) >= width)
11273 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274
Victor Stinnerc4b49542011-12-11 22:44:26 +010011275 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 left = marg / 2 + (marg & width & 1);
11277
Victor Stinner9310abb2011-10-05 00:59:23 +020011278 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279}
11280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281/* This function assumes that str1 and str2 are readied by the caller. */
11282
Marc-André Lemburge5034372000-08-08 08:04:29 +000011283static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011284unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011285{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011286#define COMPARE(TYPE1, TYPE2) \
11287 do { \
11288 TYPE1* p1 = (TYPE1 *)data1; \
11289 TYPE2* p2 = (TYPE2 *)data2; \
11290 TYPE1* end = p1 + len; \
11291 Py_UCS4 c1, c2; \
11292 for (; p1 != end; p1++, p2++) { \
11293 c1 = *p1; \
11294 c2 = *p2; \
11295 if (c1 != c2) \
11296 return (c1 < c2) ? -1 : 1; \
11297 } \
11298 } \
11299 while (0)
11300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011302 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011303 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 kind1 = PyUnicode_KIND(str1);
11306 kind2 = PyUnicode_KIND(str2);
11307 data1 = PyUnicode_DATA(str1);
11308 data2 = PyUnicode_DATA(str2);
11309 len1 = PyUnicode_GET_LENGTH(str1);
11310 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011311 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011312
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011313 switch(kind1) {
11314 case PyUnicode_1BYTE_KIND:
11315 {
11316 switch(kind2) {
11317 case PyUnicode_1BYTE_KIND:
11318 {
11319 int cmp = memcmp(data1, data2, len);
11320 /* normalize result of memcmp() into the range [-1; 1] */
11321 if (cmp < 0)
11322 return -1;
11323 if (cmp > 0)
11324 return 1;
11325 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011326 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011327 case PyUnicode_2BYTE_KIND:
11328 COMPARE(Py_UCS1, Py_UCS2);
11329 break;
11330 case PyUnicode_4BYTE_KIND:
11331 COMPARE(Py_UCS1, Py_UCS4);
11332 break;
11333 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011334 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011335 }
11336 break;
11337 }
11338 case PyUnicode_2BYTE_KIND:
11339 {
11340 switch(kind2) {
11341 case PyUnicode_1BYTE_KIND:
11342 COMPARE(Py_UCS2, Py_UCS1);
11343 break;
11344 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011345 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011346 COMPARE(Py_UCS2, Py_UCS2);
11347 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011348 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011349 case PyUnicode_4BYTE_KIND:
11350 COMPARE(Py_UCS2, Py_UCS4);
11351 break;
11352 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011353 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011354 }
11355 break;
11356 }
11357 case PyUnicode_4BYTE_KIND:
11358 {
11359 switch(kind2) {
11360 case PyUnicode_1BYTE_KIND:
11361 COMPARE(Py_UCS4, Py_UCS1);
11362 break;
11363 case PyUnicode_2BYTE_KIND:
11364 COMPARE(Py_UCS4, Py_UCS2);
11365 break;
11366 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011367 {
11368#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11369 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11370 /* normalize result of wmemcmp() into the range [-1; 1] */
11371 if (cmp < 0)
11372 return -1;
11373 if (cmp > 0)
11374 return 1;
11375#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011376 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011377#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011378 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011379 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011380 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011381 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011382 }
11383 break;
11384 }
11385 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011386 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011387 }
11388
Victor Stinner770e19e2012-10-04 22:59:45 +020011389 if (len1 == len2)
11390 return 0;
11391 if (len1 < len2)
11392 return -1;
11393 else
11394 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011395
11396#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011397}
11398
Benjamin Peterson621b4302016-09-09 13:54:34 -070011399static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011400unicode_compare_eq(PyObject *str1, PyObject *str2)
11401{
11402 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011403 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011404 Py_ssize_t len;
11405 int cmp;
11406
Victor Stinnere5567ad2012-10-23 02:48:49 +020011407 len = PyUnicode_GET_LENGTH(str1);
11408 if (PyUnicode_GET_LENGTH(str2) != len)
11409 return 0;
11410 kind = PyUnicode_KIND(str1);
11411 if (PyUnicode_KIND(str2) != kind)
11412 return 0;
11413 data1 = PyUnicode_DATA(str1);
11414 data2 = PyUnicode_DATA(str2);
11415
11416 cmp = memcmp(data1, data2, len * kind);
11417 return (cmp == 0);
11418}
11419
11420
Alexander Belopolsky40018472011-02-26 01:02:56 +000011421int
11422PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11425 if (PyUnicode_READY(left) == -1 ||
11426 PyUnicode_READY(right) == -1)
11427 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011428
11429 /* a string is equal to itself */
11430 if (left == right)
11431 return 0;
11432
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011433 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011435 PyErr_Format(PyExc_TypeError,
11436 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011437 Py_TYPE(left)->tp_name,
11438 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 return -1;
11440}
11441
Martin v. Löwis5b222132007-06-10 09:51:05 +000011442int
11443PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11444{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 Py_ssize_t i;
11446 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011448 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449
Victor Stinner910337b2011-10-03 03:20:16 +020011450 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011451 if (!PyUnicode_IS_READY(uni)) {
11452 const wchar_t *ws = _PyUnicode_WSTR(uni);
11453 /* Compare Unicode string and source character set string */
11454 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11455 if (chr != ustr[i])
11456 return (chr < ustr[i]) ? -1 : 1;
11457 }
11458 /* This check keeps Python strings that end in '\0' from comparing equal
11459 to C strings identical up to that point. */
11460 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11461 return 1; /* uni is longer */
11462 if (ustr[i])
11463 return -1; /* str is longer */
11464 return 0;
11465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011467 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011468 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011469 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011470 size_t len, len2 = strlen(str);
11471 int cmp;
11472
11473 len = Py_MIN(len1, len2);
11474 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011475 if (cmp != 0) {
11476 if (cmp < 0)
11477 return -1;
11478 else
11479 return 1;
11480 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011481 if (len1 > len2)
11482 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011483 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011484 return -1; /* str is longer */
11485 return 0;
11486 }
11487 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011488 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011489 /* Compare Unicode string and source character set string */
11490 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011491 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011492 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11493 /* This check keeps Python strings that end in '\0' from comparing equal
11494 to C strings identical up to that point. */
11495 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11496 return 1; /* uni is longer */
11497 if (str[i])
11498 return -1; /* str is longer */
11499 return 0;
11500 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011501}
11502
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011503static int
11504non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11505{
11506 size_t i, len;
11507 const wchar_t *p;
11508 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11509 if (strlen(str) != len)
11510 return 0;
11511 p = _PyUnicode_WSTR(unicode);
11512 assert(p);
11513 for (i = 0; i < len; i++) {
11514 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011515 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011516 return 0;
11517 }
11518 return 1;
11519}
11520
11521int
11522_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11523{
11524 size_t len;
11525 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011526 assert(str);
11527#ifndef NDEBUG
11528 for (const char *p = str; *p; p++) {
11529 assert((unsigned char)*p < 128);
11530 }
11531#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011532 if (PyUnicode_READY(unicode) == -1) {
11533 /* Memory error or bad data */
11534 PyErr_Clear();
11535 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11536 }
11537 if (!PyUnicode_IS_ASCII(unicode))
11538 return 0;
11539 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11540 return strlen(str) == len &&
11541 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11542}
11543
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011544int
11545_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11546{
11547 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011548
11549 assert(_PyUnicode_CHECK(left));
11550 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011551#ifndef NDEBUG
11552 for (const char *p = right->string; *p; p++) {
11553 assert((unsigned char)*p < 128);
11554 }
11555#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011556
11557 if (PyUnicode_READY(left) == -1) {
11558 /* memory error or bad data */
11559 PyErr_Clear();
11560 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11561 }
11562
11563 if (!PyUnicode_IS_ASCII(left))
11564 return 0;
11565
11566 right_uni = _PyUnicode_FromId(right); /* borrowed */
11567 if (right_uni == NULL) {
11568 /* memory error or bad data */
11569 PyErr_Clear();
11570 return _PyUnicode_EqualToASCIIString(left, right->string);
11571 }
11572
11573 if (left == right_uni)
11574 return 1;
11575
11576 if (PyUnicode_CHECK_INTERNED(left))
11577 return 0;
11578
INADA Naoki7cc95f52018-01-28 02:07:09 +090011579 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011580 Py_hash_t hash = _PyUnicode_HASH(left);
Victor Stinnerea251802020-12-26 02:58:33 +010011581 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011582 return 0;
Victor Stinnerea251802020-12-26 02:58:33 +010011583 }
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011584
11585 return unicode_compare_eq(left, right_uni);
11586}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011587
Alexander Belopolsky40018472011-02-26 01:02:56 +000011588PyObject *
11589PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011590{
11591 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011592
Victor Stinnere5567ad2012-10-23 02:48:49 +020011593 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11594 Py_RETURN_NOTIMPLEMENTED;
11595
11596 if (PyUnicode_READY(left) == -1 ||
11597 PyUnicode_READY(right) == -1)
11598 return NULL;
11599
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011600 if (left == right) {
11601 switch (op) {
11602 case Py_EQ:
11603 case Py_LE:
11604 case Py_GE:
11605 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011606 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011607 case Py_NE:
11608 case Py_LT:
11609 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011610 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011611 default:
11612 PyErr_BadArgument();
11613 return NULL;
11614 }
11615 }
11616 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011617 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011618 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011619 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011620 }
11621 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011622 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011623 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011624 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011625}
11626
Alexander Belopolsky40018472011-02-26 01:02:56 +000011627int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011628_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11629{
11630 return unicode_eq(aa, bb);
11631}
11632
11633int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011634PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011635{
Victor Stinner77282cb2013-04-14 19:22:47 +020011636 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011637 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011639 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011640
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011641 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011643 "'in <string>' requires string as left operand, not %.100s",
11644 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011645 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011646 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011647 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011648 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011649 if (ensure_unicode(str) < 0)
11650 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011653 kind2 = PyUnicode_KIND(substr);
11654 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011655 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011657 len2 = PyUnicode_GET_LENGTH(substr);
11658 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011659 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011660 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011661 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011662 if (len2 == 1) {
11663 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11664 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011665 return result;
11666 }
11667 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011668 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011669 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011670 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672
Victor Stinner77282cb2013-04-14 19:22:47 +020011673 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 case PyUnicode_1BYTE_KIND:
11675 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11676 break;
11677 case PyUnicode_2BYTE_KIND:
11678 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11679 break;
11680 case PyUnicode_4BYTE_KIND:
11681 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11682 break;
11683 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011684 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011686
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011687 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011688 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011689 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690
Guido van Rossum403d68b2000-03-13 15:55:09 +000011691 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011692}
11693
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694/* Concat to string or Unicode object giving a new Unicode object. */
11695
Alexander Belopolsky40018472011-02-26 01:02:56 +000011696PyObject *
11697PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011699 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011700 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011701 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011703 if (ensure_unicode(left) < 0)
11704 return NULL;
11705
11706 if (!PyUnicode_Check(right)) {
11707 PyErr_Format(PyExc_TypeError,
11708 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011709 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011710 return NULL;
11711 }
11712 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714
11715 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011716 PyObject *empty = unicode_get_empty(); // Borrowed reference
11717 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011718 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011719 }
11720 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011721 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011724 left_len = PyUnicode_GET_LENGTH(left);
11725 right_len = PyUnicode_GET_LENGTH(right);
11726 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011727 PyErr_SetString(PyExc_OverflowError,
11728 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011729 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011730 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011731 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011732
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011733 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11734 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011735 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011738 result = PyUnicode_New(new_len, maxchar);
11739 if (result == NULL)
11740 return NULL;
11741 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11742 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11743 assert(_PyUnicode_CheckConsistency(result, 1));
11744 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745}
11746
Walter Dörwald1ab83302007-05-18 17:15:44 +000011747void
Victor Stinner23e56682011-10-03 03:54:37 +020011748PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011749{
Victor Stinner23e56682011-10-03 03:54:37 +020011750 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011751 Py_UCS4 maxchar, maxchar2;
11752 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011753
11754 if (p_left == NULL) {
11755 if (!PyErr_Occurred())
11756 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011757 return;
11758 }
Victor Stinner23e56682011-10-03 03:54:37 +020011759 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011760 if (right == NULL || left == NULL
11761 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011762 if (!PyErr_Occurred())
11763 PyErr_BadInternalCall();
11764 goto error;
11765 }
11766
Benjamin Petersonbac79492012-01-14 13:34:47 -050011767 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011768 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011769 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011770 goto error;
11771
Victor Stinner488fa492011-12-12 00:01:39 +010011772 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011773 PyObject *empty = unicode_get_empty(); // Borrowed reference
11774 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011775 Py_DECREF(left);
11776 Py_INCREF(right);
11777 *p_left = right;
11778 return;
11779 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011780 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011781 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011782 }
Victor Stinner488fa492011-12-12 00:01:39 +010011783
11784 left_len = PyUnicode_GET_LENGTH(left);
11785 right_len = PyUnicode_GET_LENGTH(right);
11786 if (left_len > PY_SSIZE_T_MAX - right_len) {
11787 PyErr_SetString(PyExc_OverflowError,
11788 "strings are too large to concat");
11789 goto error;
11790 }
11791 new_len = left_len + right_len;
11792
11793 if (unicode_modifiable(left)
11794 && PyUnicode_CheckExact(right)
11795 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011796 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11797 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011798 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011799 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011800 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11801 {
11802 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011803 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011804 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011805
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011806 /* copy 'right' into the newly allocated area of 'left' */
11807 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011808 }
Victor Stinner488fa492011-12-12 00:01:39 +010011809 else {
11810 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11811 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011812 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011813
Victor Stinner488fa492011-12-12 00:01:39 +010011814 /* Concat the two Unicode strings */
11815 res = PyUnicode_New(new_len, maxchar);
11816 if (res == NULL)
11817 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011818 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11819 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011820 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011821 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011822 }
11823 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011824 return;
11825
11826error:
Victor Stinner488fa492011-12-12 00:01:39 +010011827 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011828}
11829
11830void
11831PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11832{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011833 PyUnicode_Append(pleft, right);
11834 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011835}
11836
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011837/*
11838Wraps stringlib_parse_args_finds() and additionally ensures that the
11839first argument is a unicode object.
11840*/
11841
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011842static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011843parse_args_finds_unicode(const char * function_name, PyObject *args,
11844 PyObject **substring,
11845 Py_ssize_t *start, Py_ssize_t *end)
11846{
11847 if(stringlib_parse_args_finds(function_name, args, substring,
11848 start, end)) {
11849 if (ensure_unicode(*substring) < 0)
11850 return 0;
11851 return 1;
11852 }
11853 return 0;
11854}
11855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011856PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011857 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011859Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011860string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011861interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862
11863static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011864unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011866 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011867 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011868 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011870 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011871 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011874 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011875 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 kind1 = PyUnicode_KIND(self);
11878 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011879 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011880 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 len1 = PyUnicode_GET_LENGTH(self);
11883 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011885 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011886 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011887
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011888 buf1 = PyUnicode_DATA(self);
11889 buf2 = PyUnicode_DATA(substring);
11890 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011891 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011892 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011893 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011894 }
11895 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 case PyUnicode_1BYTE_KIND:
11897 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011898 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 buf2, len2, PY_SSIZE_T_MAX
11900 );
11901 break;
11902 case PyUnicode_2BYTE_KIND:
11903 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011904 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 buf2, len2, PY_SSIZE_T_MAX
11906 );
11907 break;
11908 case PyUnicode_4BYTE_KIND:
11909 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011910 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 buf2, len2, PY_SSIZE_T_MAX
11912 );
11913 break;
11914 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011915 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 }
11917
11918 result = PyLong_FromSsize_t(iresult);
11919
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011920 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011921 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011922 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 return result;
11925}
11926
INADA Naoki3ae20562017-01-16 20:41:20 +090011927/*[clinic input]
11928str.encode as unicode_encode
11929
11930 encoding: str(c_default="NULL") = 'utf-8'
11931 The encoding in which to encode the string.
11932 errors: str(c_default="NULL") = 'strict'
11933 The error handling scheme to use for encoding errors.
11934 The default is 'strict' meaning that encoding errors raise a
11935 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11936 'xmlcharrefreplace' as well as any other name registered with
11937 codecs.register_error that can handle UnicodeEncodeErrors.
11938
11939Encode the string using the codec registered for encoding.
11940[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941
11942static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011943unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011944/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011946 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011947}
11948
INADA Naoki3ae20562017-01-16 20:41:20 +090011949/*[clinic input]
11950str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951
INADA Naoki3ae20562017-01-16 20:41:20 +090011952 tabsize: int = 8
11953
11954Return a copy where all tab characters are expanded using spaces.
11955
11956If tabsize is not given, a tab size of 8 characters is assumed.
11957[clinic start generated code]*/
11958
11959static PyObject *
11960unicode_expandtabs_impl(PyObject *self, int tabsize)
11961/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011963 Py_ssize_t i, j, line_pos, src_len, incr;
11964 Py_UCS4 ch;
11965 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011966 const void *src_data;
11967 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011968 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011969 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970
Antoine Pitrou22425222011-10-04 19:10:51 +020011971 if (PyUnicode_READY(self) == -1)
11972 return NULL;
11973
Thomas Wouters7e474022000-07-16 12:04:32 +000011974 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011975 src_len = PyUnicode_GET_LENGTH(self);
11976 i = j = line_pos = 0;
11977 kind = PyUnicode_KIND(self);
11978 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011979 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011980 for (; i < src_len; i++) {
11981 ch = PyUnicode_READ(kind, src_data, i);
11982 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011983 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011984 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011985 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011987 goto overflow;
11988 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011990 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011994 goto overflow;
11995 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011997 if (ch == '\n' || ch == '\r')
11998 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020012000 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010012001 if (!found)
12002 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000012003
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020012005 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006 if (!u)
12007 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020012008 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009
Antoine Pitroue71d5742011-10-04 15:55:09 +020012010 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
Antoine Pitroue71d5742011-10-04 15:55:09 +020012012 for (; i < src_len; i++) {
12013 ch = PyUnicode_READ(kind, src_data, i);
12014 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020012016 incr = tabsize - (line_pos % tabsize);
12017 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010012018 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010012019 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012021 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020012023 line_pos++;
12024 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012025 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020012026 if (ch == '\n' || ch == '\r')
12027 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020012029 }
12030 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010012031 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012032
Antoine Pitroue71d5742011-10-04 15:55:09 +020012033 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012034 PyErr_SetString(PyExc_OverflowError, "new string is too long");
12035 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036}
12037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012038PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012039 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040\n\
12041Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012042such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043arguments start and end are interpreted as in slice notation.\n\
12044\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012045Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046
12047static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012050 /* initialize variables to prevent gcc warning */
12051 PyObject *substring = NULL;
12052 Py_ssize_t start = 0;
12053 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012054 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012056 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012059 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012062 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 if (result == -2)
12065 return NULL;
12066
Christian Heimes217cfd12007-12-02 14:31:20 +000012067 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068}
12069
12070static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012071unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012073 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012074 enum PyUnicode_Kind kind;
12075 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012076
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012077 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012078 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012080 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012081 if (PyUnicode_READY(self) == -1) {
12082 return NULL;
12083 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012084 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12085 PyErr_SetString(PyExc_IndexError, "string index out of range");
12086 return NULL;
12087 }
12088 kind = PyUnicode_KIND(self);
12089 data = PyUnicode_DATA(self);
12090 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012091 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092}
12093
Guido van Rossumc2504932007-09-18 19:42:40 +000012094/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012095 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012096static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012097unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012099 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012100
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012101#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012102 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012103#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 if (_PyUnicode_HASH(self) != -1)
12105 return _PyUnicode_HASH(self);
12106 if (PyUnicode_READY(self) == -1)
12107 return -1;
animalizea1d14252019-01-02 20:16:06 +080012108
Christian Heimes985ecdc2013-11-20 11:46:18 +010012109 x = _Py_HashBytes(PyUnicode_DATA(self),
12110 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012112 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113}
12114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012115PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117\n\
oldkaa0735f2018-02-02 16:52:55 +080012118Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012119such that sub is contained within S[start:end]. Optional\n\
12120arguments start and end are interpreted as in slice notation.\n\
12121\n\
12122Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123
12124static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012127 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012128 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012129 PyObject *substring = NULL;
12130 Py_ssize_t start = 0;
12131 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012133 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012136 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012139 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 if (result == -2)
12142 return NULL;
12143
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144 if (result < 0) {
12145 PyErr_SetString(PyExc_ValueError, "substring not found");
12146 return NULL;
12147 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012148
Christian Heimes217cfd12007-12-02 14:31:20 +000012149 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150}
12151
INADA Naoki3ae20562017-01-16 20:41:20 +090012152/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012153str.isascii as unicode_isascii
12154
12155Return True if all characters in the string are ASCII, False otherwise.
12156
12157ASCII characters have code points in the range U+0000-U+007F.
12158Empty string is ASCII too.
12159[clinic start generated code]*/
12160
12161static PyObject *
12162unicode_isascii_impl(PyObject *self)
12163/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12164{
12165 if (PyUnicode_READY(self) == -1) {
12166 return NULL;
12167 }
12168 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12169}
12170
12171/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012172str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173
INADA Naoki3ae20562017-01-16 20:41:20 +090012174Return True if the string is a lowercase string, False otherwise.
12175
12176A string is lowercase if all cased characters in the string are lowercase and
12177there is at least one cased character in the string.
12178[clinic start generated code]*/
12179
12180static PyObject *
12181unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012182/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 Py_ssize_t i, length;
12185 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012186 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187 int cased;
12188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 if (PyUnicode_READY(self) == -1)
12190 return NULL;
12191 length = PyUnicode_GET_LENGTH(self);
12192 kind = PyUnicode_KIND(self);
12193 data = PyUnicode_DATA(self);
12194
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 if (length == 1)
12197 return PyBool_FromLong(
12198 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012200 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012202 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012203
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 for (i = 0; i < length; i++) {
12206 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012207
Benjamin Peterson29060642009-01-31 22:14:21 +000012208 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012209 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012210 else if (!cased && Py_UNICODE_ISLOWER(ch))
12211 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012213 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214}
12215
INADA Naoki3ae20562017-01-16 20:41:20 +090012216/*[clinic input]
12217str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218
INADA Naoki3ae20562017-01-16 20:41:20 +090012219Return True if the string is an uppercase string, False otherwise.
12220
12221A string is uppercase if all cased characters in the string are uppercase and
12222there is at least one cased character in the string.
12223[clinic start generated code]*/
12224
12225static PyObject *
12226unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012227/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 Py_ssize_t i, length;
12230 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012231 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 int cased;
12233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 if (PyUnicode_READY(self) == -1)
12235 return NULL;
12236 length = PyUnicode_GET_LENGTH(self);
12237 kind = PyUnicode_KIND(self);
12238 data = PyUnicode_DATA(self);
12239
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 if (length == 1)
12242 return PyBool_FromLong(
12243 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012245 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012247 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012248
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 for (i = 0; i < length; i++) {
12251 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012252
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012254 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 else if (!cased && Py_UNICODE_ISUPPER(ch))
12256 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012258 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259}
12260
INADA Naoki3ae20562017-01-16 20:41:20 +090012261/*[clinic input]
12262str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263
INADA Naoki3ae20562017-01-16 20:41:20 +090012264Return True if the string is a title-cased string, False otherwise.
12265
12266In a title-cased string, upper- and title-case characters may only
12267follow uncased characters and lowercase characters only cased ones.
12268[clinic start generated code]*/
12269
12270static PyObject *
12271unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012272/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 Py_ssize_t i, length;
12275 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012276 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277 int cased, previous_is_cased;
12278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 if (PyUnicode_READY(self) == -1)
12280 return NULL;
12281 length = PyUnicode_GET_LENGTH(self);
12282 kind = PyUnicode_KIND(self);
12283 data = PyUnicode_DATA(self);
12284
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 if (length == 1) {
12287 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12288 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12289 (Py_UNICODE_ISUPPER(ch) != 0));
12290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012292 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012294 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012295
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296 cased = 0;
12297 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 for (i = 0; i < length; i++) {
12299 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012300
Benjamin Peterson29060642009-01-31 22:14:21 +000012301 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12302 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012303 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 previous_is_cased = 1;
12305 cased = 1;
12306 }
12307 else if (Py_UNICODE_ISLOWER(ch)) {
12308 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012309 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012310 previous_is_cased = 1;
12311 cased = 1;
12312 }
12313 else
12314 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012316 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317}
12318
INADA Naoki3ae20562017-01-16 20:41:20 +090012319/*[clinic input]
12320str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321
INADA Naoki3ae20562017-01-16 20:41:20 +090012322Return True if the string is a whitespace string, False otherwise.
12323
12324A string is whitespace if all characters in the string are whitespace and there
12325is at least one character in the string.
12326[clinic start generated code]*/
12327
12328static PyObject *
12329unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012330/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 Py_ssize_t i, length;
12333 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012334 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335
12336 if (PyUnicode_READY(self) == -1)
12337 return NULL;
12338 length = PyUnicode_GET_LENGTH(self);
12339 kind = PyUnicode_KIND(self);
12340 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 if (length == 1)
12344 return PyBool_FromLong(
12345 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012347 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012349 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 for (i = 0; i < length; i++) {
12352 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012353 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012354 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012356 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357}
12358
INADA Naoki3ae20562017-01-16 20:41:20 +090012359/*[clinic input]
12360str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012361
INADA Naoki3ae20562017-01-16 20:41:20 +090012362Return True if the string is an alphabetic string, False otherwise.
12363
12364A string is alphabetic if all characters in the string are alphabetic and there
12365is at least one character in the string.
12366[clinic start generated code]*/
12367
12368static PyObject *
12369unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012370/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 Py_ssize_t i, length;
12373 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012374 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375
12376 if (PyUnicode_READY(self) == -1)
12377 return NULL;
12378 length = PyUnicode_GET_LENGTH(self);
12379 kind = PyUnicode_KIND(self);
12380 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012381
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012382 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 if (length == 1)
12384 return PyBool_FromLong(
12385 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012386
12387 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012389 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 for (i = 0; i < length; i++) {
12392 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012393 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012394 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012395 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012396}
12397
INADA Naoki3ae20562017-01-16 20:41:20 +090012398/*[clinic input]
12399str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012400
INADA Naoki3ae20562017-01-16 20:41:20 +090012401Return True if the string is an alpha-numeric string, False otherwise.
12402
12403A string is alpha-numeric if all characters in the string are alpha-numeric and
12404there is at least one character in the string.
12405[clinic start generated code]*/
12406
12407static PyObject *
12408unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012409/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012412 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 Py_ssize_t len, i;
12414
12415 if (PyUnicode_READY(self) == -1)
12416 return NULL;
12417
12418 kind = PyUnicode_KIND(self);
12419 data = PyUnicode_DATA(self);
12420 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012421
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012422 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 if (len == 1) {
12424 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12425 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12426 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012427
12428 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012430 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 for (i = 0; i < len; i++) {
12433 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012434 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012435 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012436 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012437 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012438}
12439
INADA Naoki3ae20562017-01-16 20:41:20 +090012440/*[clinic input]
12441str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442
INADA Naoki3ae20562017-01-16 20:41:20 +090012443Return True if the string is a decimal string, False otherwise.
12444
12445A string is a decimal string if all characters in the string are decimal and
12446there is at least one character in the string.
12447[clinic start generated code]*/
12448
12449static PyObject *
12450unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012451/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 Py_ssize_t i, length;
12454 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012455 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456
12457 if (PyUnicode_READY(self) == -1)
12458 return NULL;
12459 length = PyUnicode_GET_LENGTH(self);
12460 kind = PyUnicode_KIND(self);
12461 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 if (length == 1)
12465 return PyBool_FromLong(
12466 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012468 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012470 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 for (i = 0; i < length; i++) {
12473 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012474 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012476 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477}
12478
INADA Naoki3ae20562017-01-16 20:41:20 +090012479/*[clinic input]
12480str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481
INADA Naoki3ae20562017-01-16 20:41:20 +090012482Return True if the string is a digit string, False otherwise.
12483
12484A string is a digit string if all characters in the string are digits and there
12485is at least one character in the string.
12486[clinic start generated code]*/
12487
12488static PyObject *
12489unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012490/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 Py_ssize_t i, length;
12493 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012494 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495
12496 if (PyUnicode_READY(self) == -1)
12497 return NULL;
12498 length = PyUnicode_GET_LENGTH(self);
12499 kind = PyUnicode_KIND(self);
12500 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 if (length == 1) {
12504 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12505 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012508 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012510 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 for (i = 0; i < length; i++) {
12513 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012514 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012516 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517}
12518
INADA Naoki3ae20562017-01-16 20:41:20 +090012519/*[clinic input]
12520str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
INADA Naoki3ae20562017-01-16 20:41:20 +090012522Return True if the string is a numeric string, False otherwise.
12523
12524A string is numeric if all characters in the string are numeric and there is at
12525least one character in the string.
12526[clinic start generated code]*/
12527
12528static PyObject *
12529unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012530/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 Py_ssize_t i, length;
12533 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012534 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535
12536 if (PyUnicode_READY(self) == -1)
12537 return NULL;
12538 length = PyUnicode_GET_LENGTH(self);
12539 kind = PyUnicode_KIND(self);
12540 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 if (length == 1)
12544 return PyBool_FromLong(
12545 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012547 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012549 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 for (i = 0; i < length; i++) {
12552 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012553 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012555 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556}
12557
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012558Py_ssize_t
12559_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012560{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012562 if (PyUnicode_READY(self) == -1)
12563 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012564
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012565 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012566 if (len == 0) {
12567 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012568 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 }
12570
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012571 int kind = PyUnicode_KIND(self);
12572 const void *data = PyUnicode_DATA(self);
12573 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012574 /* PEP 3131 says that the first character must be in
12575 XID_Start and subsequent characters in XID_Continue,
12576 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012577 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012578 letters, digits, underscore). However, given the current
12579 definition of XID_Start and XID_Continue, it is sufficient
12580 to check just for these, except that _ must be allowed
12581 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012582 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012583 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012584 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012585
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012586 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012587 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012588 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012589 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012590 }
12591 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012592 return i;
12593}
12594
12595int
12596PyUnicode_IsIdentifier(PyObject *self)
12597{
12598 if (PyUnicode_IS_READY(self)) {
12599 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12600 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12601 /* an empty string is not a valid identifier */
12602 return len && i == len;
12603 }
12604 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012605_Py_COMP_DIAG_PUSH
12606_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012607 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012608 if (len == 0) {
12609 /* an empty string is not a valid identifier */
12610 return 0;
12611 }
12612
12613 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012614 Py_UCS4 ch = wstr[i++];
12615#if SIZEOF_WCHAR_T == 2
12616 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12617 && i < len
12618 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12619 {
12620 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12621 i++;
12622 }
12623#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012624 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12625 return 0;
12626 }
12627
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012628 while (i < len) {
12629 ch = wstr[i++];
12630#if SIZEOF_WCHAR_T == 2
12631 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12632 && i < len
12633 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12634 {
12635 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12636 i++;
12637 }
12638#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012639 if (!_PyUnicode_IsXidContinue(ch)) {
12640 return 0;
12641 }
12642 }
12643 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012644_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012645 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012646}
12647
INADA Naoki3ae20562017-01-16 20:41:20 +090012648/*[clinic input]
12649str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012650
INADA Naoki3ae20562017-01-16 20:41:20 +090012651Return True if the string is a valid Python identifier, False otherwise.
12652
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012653Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012654such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012655[clinic start generated code]*/
12656
12657static PyObject *
12658unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012659/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012660{
12661 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12662}
12663
INADA Naoki3ae20562017-01-16 20:41:20 +090012664/*[clinic input]
12665str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012666
INADA Naoki3ae20562017-01-16 20:41:20 +090012667Return True if the string is printable, False otherwise.
12668
12669A string is printable if all of its characters are considered printable in
12670repr() or if it is empty.
12671[clinic start generated code]*/
12672
12673static PyObject *
12674unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012675/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 Py_ssize_t i, length;
12678 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012679 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680
12681 if (PyUnicode_READY(self) == -1)
12682 return NULL;
12683 length = PyUnicode_GET_LENGTH(self);
12684 kind = PyUnicode_KIND(self);
12685 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012686
12687 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 if (length == 1)
12689 return PyBool_FromLong(
12690 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 for (i = 0; i < length; i++) {
12693 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012694 Py_RETURN_FALSE;
12695 }
12696 }
12697 Py_RETURN_TRUE;
12698}
12699
INADA Naoki3ae20562017-01-16 20:41:20 +090012700/*[clinic input]
12701str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702
INADA Naoki3ae20562017-01-16 20:41:20 +090012703 iterable: object
12704 /
12705
12706Concatenate any number of strings.
12707
Martin Panter91a88662017-01-24 00:30:06 +000012708The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012709The result is returned as a new string.
12710
12711Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12712[clinic start generated code]*/
12713
12714static PyObject *
12715unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012716/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717{
INADA Naoki3ae20562017-01-16 20:41:20 +090012718 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719}
12720
Martin v. Löwis18e16552006-02-15 17:27:45 +000012721static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012722unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 if (PyUnicode_READY(self) == -1)
12725 return -1;
12726 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727}
12728
INADA Naoki3ae20562017-01-16 20:41:20 +090012729/*[clinic input]
12730str.ljust as unicode_ljust
12731
12732 width: Py_ssize_t
12733 fillchar: Py_UCS4 = ' '
12734 /
12735
12736Return a left-justified string of length width.
12737
12738Padding is done using the specified fill character (default is a space).
12739[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740
12741static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012742unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12743/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012745 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747
Victor Stinnerc4b49542011-12-11 22:44:26 +010012748 if (PyUnicode_GET_LENGTH(self) >= width)
12749 return unicode_result_unchanged(self);
12750
12751 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752}
12753
INADA Naoki3ae20562017-01-16 20:41:20 +090012754/*[clinic input]
12755str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756
INADA Naoki3ae20562017-01-16 20:41:20 +090012757Return a copy of the string converted to lowercase.
12758[clinic start generated code]*/
12759
12760static PyObject *
12761unicode_lower_impl(PyObject *self)
12762/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012764 if (PyUnicode_READY(self) == -1)
12765 return NULL;
12766 if (PyUnicode_IS_ASCII(self))
12767 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012768 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769}
12770
/* Selectors for the strip family.  Each value doubles as an index into
   stripfuncnames[] below, so the two must be kept in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  Both the strings and the
   pointer table itself are const: nothing is meant to write to this table,
   and const-qualifying the pointers lets the compiler enforce that. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for strip selector i; no bounds check is performed. */
#define STRIPNAME(i) (stripfuncnames[(i)])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012779
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012780/* externally visible for str.strip(unicode) */
12781PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012782_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012783{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012784 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 int kind;
12786 Py_ssize_t i, j, len;
12787 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012788 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12791 return NULL;
12792
12793 kind = PyUnicode_KIND(self);
12794 data = PyUnicode_DATA(self);
12795 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012796 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12798 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012799 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012800
Benjamin Peterson14339b62009-01-31 16:36:08 +000012801 i = 0;
12802 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012803 while (i < len) {
12804 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12805 if (!BLOOM(sepmask, ch))
12806 break;
12807 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12808 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 i++;
12810 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012811 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012812
Benjamin Peterson14339b62009-01-31 16:36:08 +000012813 j = len;
12814 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012815 j--;
12816 while (j >= i) {
12817 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12818 if (!BLOOM(sepmask, ch))
12819 break;
12820 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12821 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012823 }
12824
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012826 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012827
Victor Stinner7931d9a2011-11-04 00:22:48 +010012828 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829}
12830
12831PyObject*
12832PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12833{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012834 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012836 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837
Victor Stinnerde636f32011-10-01 03:55:54 +020012838 if (PyUnicode_READY(self) == -1)
12839 return NULL;
12840
Victor Stinner684d5fd2012-05-03 02:32:34 +020012841 length = PyUnicode_GET_LENGTH(self);
12842 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012843
Victor Stinner684d5fd2012-05-03 02:32:34 +020012844 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012845 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846
Victor Stinnerde636f32011-10-01 03:55:54 +020012847 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012848 PyErr_SetString(PyExc_IndexError, "string index out of range");
12849 return NULL;
12850 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012851 if (start >= length || end < start)
12852 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012853
Victor Stinner684d5fd2012-05-03 02:32:34 +020012854 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012855 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012856 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012857 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012858 }
12859 else {
12860 kind = PyUnicode_KIND(self);
12861 data = PyUnicode_1BYTE_DATA(self);
12862 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012863 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012864 length);
12865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867
12868static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012869do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 Py_ssize_t len, i, j;
12872
12873 if (PyUnicode_READY(self) == -1)
12874 return NULL;
12875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012877
Victor Stinnercc7af722013-04-09 22:39:24 +020012878 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012879 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012880
12881 i = 0;
12882 if (striptype != RIGHTSTRIP) {
12883 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012884 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012885 if (!_Py_ascii_whitespace[ch])
12886 break;
12887 i++;
12888 }
12889 }
12890
12891 j = len;
12892 if (striptype != LEFTSTRIP) {
12893 j--;
12894 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012895 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012896 if (!_Py_ascii_whitespace[ch])
12897 break;
12898 j--;
12899 }
12900 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012901 }
12902 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012903 else {
12904 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012905 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012906
Victor Stinnercc7af722013-04-09 22:39:24 +020012907 i = 0;
12908 if (striptype != RIGHTSTRIP) {
12909 while (i < len) {
12910 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12911 if (!Py_UNICODE_ISSPACE(ch))
12912 break;
12913 i++;
12914 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012915 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012916
12917 j = len;
12918 if (striptype != LEFTSTRIP) {
12919 j--;
12920 while (j >= i) {
12921 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12922 if (!Py_UNICODE_ISSPACE(ch))
12923 break;
12924 j--;
12925 }
12926 j++;
12927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012928 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012929
Victor Stinner7931d9a2011-11-04 00:22:48 +010012930 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012931}
12932
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012933
12934static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012935do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012936{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012937 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012938 if (PyUnicode_Check(sep))
12939 return _PyUnicode_XStrip(self, striptype, sep);
12940 else {
12941 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 "%s arg must be None or str",
12943 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012944 return NULL;
12945 }
12946 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012947
Benjamin Peterson14339b62009-01-31 16:36:08 +000012948 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012949}
12950
12951
INADA Naoki3ae20562017-01-16 20:41:20 +090012952/*[clinic input]
12953str.strip as unicode_strip
12954
12955 chars: object = None
12956 /
12957
Zachary Ware09895c22019-10-09 16:09:00 -050012958Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012959
12960If chars is given and not None, remove characters in chars instead.
12961[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012962
12963static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012964unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012965/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012966{
INADA Naoki3ae20562017-01-16 20:41:20 +090012967 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012968}
12969
12970
INADA Naoki3ae20562017-01-16 20:41:20 +090012971/*[clinic input]
12972str.lstrip as unicode_lstrip
12973
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012974 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012975 /
12976
12977Return a copy of the string with leading whitespace removed.
12978
12979If chars is given and not None, remove characters in chars instead.
12980[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012981
12982static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012983unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012984/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012985{
INADA Naoki3ae20562017-01-16 20:41:20 +090012986 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012987}
12988
12989
INADA Naoki3ae20562017-01-16 20:41:20 +090012990/*[clinic input]
12991str.rstrip as unicode_rstrip
12992
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012993 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012994 /
12995
12996Return a copy of the string with trailing whitespace removed.
12997
12998If chars is given and not None, remove characters in chars instead.
12999[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013000
13001static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013002unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030013003/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013004{
INADA Naoki3ae20562017-01-16 20:41:20 +090013005 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013006}
13007
13008
Guido van Rossumd57fd912000-03-10 22:53:23 +000013009static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013010unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013012 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013014
Serhiy Storchaka05997252013-01-26 12:14:02 +020013015 if (len < 1)
13016 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017
Victor Stinnerc4b49542011-12-11 22:44:26 +010013018 /* no repeat, return original string */
13019 if (len == 1)
13020 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000013021
Benjamin Petersonbac79492012-01-14 13:34:47 -050013022 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023 return NULL;
13024
Victor Stinnerc759f3e2011-10-01 03:09:58 +020013025 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020013026 PyErr_SetString(PyExc_OverflowError,
13027 "repeated string is too long");
13028 return NULL;
13029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013031
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013032 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013033 if (!u)
13034 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020013035 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013038 int kind = PyUnicode_KIND(str);
13039 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010013040 if (kind == PyUnicode_1BYTE_KIND) {
13041 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013042 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010013043 }
13044 else if (kind == PyUnicode_2BYTE_KIND) {
13045 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013046 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010013047 ucs2[n] = fill_char;
13048 } else {
13049 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
13050 assert(kind == PyUnicode_4BYTE_KIND);
13051 for (n = 0; n < len; ++n)
13052 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020013053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 }
13055 else {
13056 /* number of characters copied this far */
13057 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013058 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020013060 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020013064 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013065 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000013066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067 }
13068
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013069 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013070 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071}
13072
Alexander Belopolsky40018472011-02-26 01:02:56 +000013073PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013074PyUnicode_Replace(PyObject *str,
13075 PyObject *substr,
13076 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000013077 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013079 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13080 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013081 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013082 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083}
13084
INADA Naoki3ae20562017-01-16 20:41:20 +090013085/*[clinic input]
13086str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087
INADA Naoki3ae20562017-01-16 20:41:20 +090013088 old: unicode
13089 new: unicode
13090 count: Py_ssize_t = -1
13091 Maximum number of occurrences to replace.
13092 -1 (the default value) means replace all occurrences.
13093 /
13094
13095Return a copy with all occurrences of substring old replaced by new.
13096
13097If the optional argument count is given, only the first count occurrences are
13098replaced.
13099[clinic start generated code]*/
13100
13101static PyObject *
13102unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13103 Py_ssize_t count)
13104/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013106 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013107 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013108 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109}
13110
sweeneydea81849b2020-04-22 17:05:48 -040013111/*[clinic input]
13112str.removeprefix as unicode_removeprefix
13113
13114 prefix: unicode
13115 /
13116
13117Return a str with the given prefix string removed if present.
13118
13119If the string starts with the prefix string, return string[len(prefix):].
13120Otherwise, return a copy of the original string.
13121[clinic start generated code]*/
13122
13123static PyObject *
13124unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13125/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13126{
13127 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13128 if (match == -1) {
13129 return NULL;
13130 }
13131 if (match) {
13132 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13133 PyUnicode_GET_LENGTH(self));
13134 }
13135 return unicode_result_unchanged(self);
13136}
13137
13138/*[clinic input]
13139str.removesuffix as unicode_removesuffix
13140
13141 suffix: unicode
13142 /
13143
13144Return a str with the given suffix string removed if present.
13145
13146If the string ends with the suffix string and that suffix is not empty,
13147return string[:-len(suffix)]. Otherwise, return a copy of the original
13148string.
13149[clinic start generated code]*/
13150
13151static PyObject *
13152unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13153/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13154{
13155 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13156 if (match == -1) {
13157 return NULL;
13158 }
13159 if (match) {
13160 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13161 - PyUnicode_GET_LENGTH(suffix));
13162 }
13163 return unicode_result_unchanged(self);
13164}
13165
Alexander Belopolsky40018472011-02-26 01:02:56 +000013166static PyObject *
13167unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013169 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013170 Py_ssize_t isize;
13171 Py_ssize_t osize, squote, dquote, i, o;
13172 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013173 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013174 const void *idata;
13175 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013178 return NULL;
13179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013180 isize = PyUnicode_GET_LENGTH(unicode);
13181 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 /* Compute length of output, quote characters, and
13184 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013185 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013186 max = 127;
13187 squote = dquote = 0;
13188 ikind = PyUnicode_KIND(unicode);
13189 for (i = 0; i < isize; i++) {
13190 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013191 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013193 case '\'': squote++; break;
13194 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013196 incr = 2;
13197 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 default:
13199 /* Fast-path ASCII */
13200 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013201 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013202 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013203 ;
13204 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013207 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013209 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013210 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013211 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013213 if (osize > PY_SSIZE_T_MAX - incr) {
13214 PyErr_SetString(PyExc_OverflowError,
13215 "string is too long to generate repr");
13216 return NULL;
13217 }
13218 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219 }
13220
13221 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013222 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013223 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013224 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013225 if (dquote)
13226 /* Both squote and dquote present. Use squote,
13227 and escape them */
13228 osize += squote;
13229 else
13230 quote = '"';
13231 }
Victor Stinner55c08782013-04-14 18:45:39 +020013232 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233
13234 repr = PyUnicode_New(osize, max);
13235 if (repr == NULL)
13236 return NULL;
13237 okind = PyUnicode_KIND(repr);
13238 odata = PyUnicode_DATA(repr);
13239
13240 PyUnicode_WRITE(okind, odata, 0, quote);
13241 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013242 if (unchanged) {
13243 _PyUnicode_FastCopyCharacters(repr, 1,
13244 unicode, 0,
13245 isize);
13246 }
13247 else {
13248 for (i = 0, o = 1; i < isize; i++) {
13249 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250
Victor Stinner55c08782013-04-14 18:45:39 +020013251 /* Escape quotes and backslashes */
13252 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013253 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013255 continue;
13256 }
13257
13258 /* Map special whitespace to '\t', \n', '\r' */
13259 if (ch == '\t') {
13260 PyUnicode_WRITE(okind, odata, o++, '\\');
13261 PyUnicode_WRITE(okind, odata, o++, 't');
13262 }
13263 else if (ch == '\n') {
13264 PyUnicode_WRITE(okind, odata, o++, '\\');
13265 PyUnicode_WRITE(okind, odata, o++, 'n');
13266 }
13267 else if (ch == '\r') {
13268 PyUnicode_WRITE(okind, odata, o++, '\\');
13269 PyUnicode_WRITE(okind, odata, o++, 'r');
13270 }
13271
13272 /* Map non-printable US ASCII to '\xhh' */
13273 else if (ch < ' ' || ch == 0x7F) {
13274 PyUnicode_WRITE(okind, odata, o++, '\\');
13275 PyUnicode_WRITE(okind, odata, o++, 'x');
13276 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13277 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13278 }
13279
13280 /* Copy ASCII characters as-is */
13281 else if (ch < 0x7F) {
13282 PyUnicode_WRITE(okind, odata, o++, ch);
13283 }
13284
13285 /* Non-ASCII characters */
13286 else {
13287 /* Map Unicode whitespace and control characters
13288 (categories Z* and C* except ASCII space)
13289 */
13290 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13291 PyUnicode_WRITE(okind, odata, o++, '\\');
13292 /* Map 8-bit characters to '\xhh' */
13293 if (ch <= 0xff) {
13294 PyUnicode_WRITE(okind, odata, o++, 'x');
13295 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13296 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13297 }
13298 /* Map 16-bit characters to '\uxxxx' */
13299 else if (ch <= 0xffff) {
13300 PyUnicode_WRITE(okind, odata, o++, 'u');
13301 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13302 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13303 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13304 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13305 }
13306 /* Map 21-bit characters to '\U00xxxxxx' */
13307 else {
13308 PyUnicode_WRITE(okind, odata, o++, 'U');
13309 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13310 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13311 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13312 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13313 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13314 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13315 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13316 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13317 }
13318 }
13319 /* Copy characters as-is */
13320 else {
13321 PyUnicode_WRITE(okind, odata, o++, ch);
13322 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013323 }
13324 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013326 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013327 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013328 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013329}
13330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013331PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013332 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013333\n\
13334Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013335such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336arguments start and end are interpreted as in slice notation.\n\
13337\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013338Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339
13340static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013341unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013343 /* initialize variables to prevent gcc warning */
13344 PyObject *substring = NULL;
13345 Py_ssize_t start = 0;
13346 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013347 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013348
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013349 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013351
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013352 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013353 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013354
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013355 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013357 if (result == -2)
13358 return NULL;
13359
Christian Heimes217cfd12007-12-02 14:31:20 +000013360 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361}
13362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013363PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013364 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013366Return the highest index in S where substring sub is found,\n\
13367such that sub is contained within S[start:end]. Optional\n\
13368arguments start and end are interpreted as in slice notation.\n\
13369\n\
13370Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371
13372static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013373unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013375 /* initialize variables to prevent gcc warning */
13376 PyObject *substring = NULL;
13377 Py_ssize_t start = 0;
13378 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013379 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013380
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013381 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013384 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013385 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013386
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013387 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013389 if (result == -2)
13390 return NULL;
13391
Guido van Rossumd57fd912000-03-10 22:53:23 +000013392 if (result < 0) {
13393 PyErr_SetString(PyExc_ValueError, "substring not found");
13394 return NULL;
13395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013396
Christian Heimes217cfd12007-12-02 14:31:20 +000013397 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398}
13399
INADA Naoki3ae20562017-01-16 20:41:20 +090013400/*[clinic input]
13401str.rjust as unicode_rjust
13402
13403 width: Py_ssize_t
13404 fillchar: Py_UCS4 = ' '
13405 /
13406
13407Return a right-justified string of length width.
13408
13409Padding is done using the specified fill character (default is a space).
13410[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411
13412static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013413unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13414/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013415{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013416 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013417 return NULL;
13418
Victor Stinnerc4b49542011-12-11 22:44:26 +010013419 if (PyUnicode_GET_LENGTH(self) >= width)
13420 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013421
Victor Stinnerc4b49542011-12-11 22:44:26 +010013422 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423}
13424
Alexander Belopolsky40018472011-02-26 01:02:56 +000013425PyObject *
13426PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013428 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013430
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013431 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432}
13433
INADA Naoki3ae20562017-01-16 20:41:20 +090013434/*[clinic input]
13435str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436
INADA Naoki3ae20562017-01-16 20:41:20 +090013437 sep: object = None
13438 The delimiter according which to split the string.
13439 None (the default value) means split according to any whitespace,
13440 and discard empty strings from the result.
13441 maxsplit: Py_ssize_t = -1
13442 Maximum number of splits to do.
13443 -1 (the default value) means no limit.
13444
13445Return a list of the words in the string, using sep as the delimiter string.
13446[clinic start generated code]*/
13447
13448static PyObject *
13449unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13450/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013451{
INADA Naoki3ae20562017-01-16 20:41:20 +090013452 if (sep == Py_None)
13453 return split(self, NULL, maxsplit);
13454 if (PyUnicode_Check(sep))
13455 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013456
Victor Stinner998b8062018-09-12 00:23:25 +020013457 PyErr_Format(PyExc_TypeError,
13458 "must be str or None, not %.100s",
13459 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013460 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013461}
13462
Thomas Wouters477c8d52006-05-27 19:21:47 +000013463PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013464PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013465{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013466 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013467 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013468 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013469 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013470
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013471 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013473
Victor Stinner14f8f022011-10-05 20:58:25 +020013474 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013475 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013476 len1 = PyUnicode_GET_LENGTH(str_obj);
13477 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013478 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013479 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013480 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013481 }
13482 buf1 = PyUnicode_DATA(str_obj);
13483 buf2 = PyUnicode_DATA(sep_obj);
13484 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013485 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013486 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013487 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013490 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013491 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013492 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13493 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13494 else
13495 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013496 break;
13497 case PyUnicode_2BYTE_KIND:
13498 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13499 break;
13500 case PyUnicode_4BYTE_KIND:
13501 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13502 break;
13503 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013504 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013506
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013507 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013508 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013509 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013510
13511 return out;
13512}
13513
13514
13515PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013516PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013517{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013518 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013519 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013520 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013521 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013522
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013523 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013524 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013525
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013526 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013527 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013528 len1 = PyUnicode_GET_LENGTH(str_obj);
13529 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013530 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013531 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013532 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013533 }
13534 buf1 = PyUnicode_DATA(str_obj);
13535 buf2 = PyUnicode_DATA(sep_obj);
13536 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013537 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013538 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013539 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013541
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013542 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013543 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013544 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13545 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13546 else
13547 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013548 break;
13549 case PyUnicode_2BYTE_KIND:
13550 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13551 break;
13552 case PyUnicode_4BYTE_KIND:
13553 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13554 break;
13555 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013556 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013557 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013558
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013559 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013560 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013561 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013562
13563 return out;
13564}
13565
INADA Naoki3ae20562017-01-16 20:41:20 +090013566/*[clinic input]
13567str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013568
INADA Naoki3ae20562017-01-16 20:41:20 +090013569 sep: object
13570 /
13571
13572Partition the string into three parts using the given separator.
13573
13574This will search for the separator in the string. If the separator is found,
13575returns a 3-tuple containing the part before the separator, the separator
13576itself, and the part after it.
13577
13578If the separator is not found, returns a 3-tuple containing the original string
13579and two empty strings.
13580[clinic start generated code]*/
13581
13582static PyObject *
13583unicode_partition(PyObject *self, PyObject *sep)
13584/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013585{
INADA Naoki3ae20562017-01-16 20:41:20 +090013586 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013587}
13588
INADA Naoki3ae20562017-01-16 20:41:20 +090013589/*[clinic input]
13590str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013591
INADA Naoki3ae20562017-01-16 20:41:20 +090013592Partition the string into three parts using the given separator.
13593
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013594This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013595the separator is found, returns a 3-tuple containing the part before the
13596separator, the separator itself, and the part after it.
13597
13598If the separator is not found, returns a 3-tuple containing two empty strings
13599and the original string.
13600[clinic start generated code]*/
13601
13602static PyObject *
13603unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013604/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013605{
INADA Naoki3ae20562017-01-16 20:41:20 +090013606 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013607}
13608
Alexander Belopolsky40018472011-02-26 01:02:56 +000013609PyObject *
13610PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013611{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013612 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013613 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013614
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013615 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013616}
13617
INADA Naoki3ae20562017-01-16 20:41:20 +090013618/*[clinic input]
13619str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013620
INADA Naoki3ae20562017-01-16 20:41:20 +090013621Return a list of the words in the string, using sep as the delimiter string.
13622
13623Splits are done starting at the end of the string and working to the front.
13624[clinic start generated code]*/
13625
13626static PyObject *
13627unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13628/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013629{
INADA Naoki3ae20562017-01-16 20:41:20 +090013630 if (sep == Py_None)
13631 return rsplit(self, NULL, maxsplit);
13632 if (PyUnicode_Check(sep))
13633 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013634
Victor Stinner998b8062018-09-12 00:23:25 +020013635 PyErr_Format(PyExc_TypeError,
13636 "must be str or None, not %.100s",
13637 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013638 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013639}
13640
INADA Naoki3ae20562017-01-16 20:41:20 +090013641/*[clinic input]
13642str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013643
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013644 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013645
13646Return a list of the lines in the string, breaking at line boundaries.
13647
13648Line breaks are not included in the resulting list unless keepends is given and
13649true.
13650[clinic start generated code]*/
13651
13652static PyObject *
13653unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013654/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013655{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013656 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013657}
13658
13659static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013660PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013661{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013662 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013663}
13664
INADA Naoki3ae20562017-01-16 20:41:20 +090013665/*[clinic input]
13666str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013667
INADA Naoki3ae20562017-01-16 20:41:20 +090013668Convert uppercase characters to lowercase and lowercase characters to uppercase.
13669[clinic start generated code]*/
13670
13671static PyObject *
13672unicode_swapcase_impl(PyObject *self)
13673/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013674{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013675 if (PyUnicode_READY(self) == -1)
13676 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013677 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013678}
13679
Larry Hastings61272b72014-01-07 12:41:53 -080013680/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013681
Larry Hastings31826802013-10-19 00:09:25 -070013682@staticmethod
13683str.maketrans as unicode_maketrans
13684
13685 x: object
13686
13687 y: unicode=NULL
13688
13689 z: unicode=NULL
13690
13691 /
13692
13693Return a translation table usable for str.translate().
13694
13695If there is only one argument, it must be a dictionary mapping Unicode
13696ordinals (integers) or characters to Unicode ordinals, strings or None.
13697Character keys will be then converted to ordinals.
13698If there are two arguments, they must be strings of equal length, and
13699in the resulting dictionary, each character in x will be mapped to the
13700character at the same position in y. If there is a third argument, it
13701must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013702[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013703
Larry Hastings31826802013-10-19 00:09:25 -070013704static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013705unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013706/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013707{
Georg Brandlceee0772007-11-27 23:48:05 +000013708 PyObject *new = NULL, *key, *value;
13709 Py_ssize_t i = 0;
13710 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013711
Georg Brandlceee0772007-11-27 23:48:05 +000013712 new = PyDict_New();
13713 if (!new)
13714 return NULL;
13715 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013716 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013717 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013718
Georg Brandlceee0772007-11-27 23:48:05 +000013719 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013720 if (!PyUnicode_Check(x)) {
13721 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13722 "be a string if there is a second argument");
13723 goto err;
13724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013725 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013726 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13727 "arguments must have equal length");
13728 goto err;
13729 }
13730 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013731 x_kind = PyUnicode_KIND(x);
13732 y_kind = PyUnicode_KIND(y);
13733 x_data = PyUnicode_DATA(x);
13734 y_data = PyUnicode_DATA(y);
13735 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13736 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013737 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013738 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013739 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013740 if (!value) {
13741 Py_DECREF(key);
13742 goto err;
13743 }
Georg Brandlceee0772007-11-27 23:48:05 +000013744 res = PyDict_SetItem(new, key, value);
13745 Py_DECREF(key);
13746 Py_DECREF(value);
13747 if (res < 0)
13748 goto err;
13749 }
13750 /* create entries for deleting chars in z */
13751 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013752 z_kind = PyUnicode_KIND(z);
13753 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013754 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013755 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013756 if (!key)
13757 goto err;
13758 res = PyDict_SetItem(new, key, Py_None);
13759 Py_DECREF(key);
13760 if (res < 0)
13761 goto err;
13762 }
13763 }
13764 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013765 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013766 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013767
Georg Brandlceee0772007-11-27 23:48:05 +000013768 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013769 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013770 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13771 "to maketrans it must be a dict");
13772 goto err;
13773 }
13774 /* copy entries into the new dict, converting string keys to int keys */
13775 while (PyDict_Next(x, &i, &key, &value)) {
13776 if (PyUnicode_Check(key)) {
13777 /* convert string keys to integer keys */
13778 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013779 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013780 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13781 "table must be of length 1");
13782 goto err;
13783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013784 kind = PyUnicode_KIND(key);
13785 data = PyUnicode_DATA(key);
13786 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013787 if (!newkey)
13788 goto err;
13789 res = PyDict_SetItem(new, newkey, value);
13790 Py_DECREF(newkey);
13791 if (res < 0)
13792 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013793 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013794 /* just keep integer keys */
13795 if (PyDict_SetItem(new, key, value) < 0)
13796 goto err;
13797 } else {
13798 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13799 "be strings or integers");
13800 goto err;
13801 }
13802 }
13803 }
13804 return new;
13805 err:
13806 Py_DECREF(new);
13807 return NULL;
13808}
13809
INADA Naoki3ae20562017-01-16 20:41:20 +090013810/*[clinic input]
13811str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013812
INADA Naoki3ae20562017-01-16 20:41:20 +090013813 table: object
13814 Translation table, which must be a mapping of Unicode ordinals to
13815 Unicode ordinals, strings, or None.
13816 /
13817
13818Replace each character in the string using the given translation table.
13819
13820The table must implement lookup/indexing via __getitem__, for instance a
13821dictionary or list. If this operation raises LookupError, the character is
13822left untouched. Characters mapped to None are deleted.
13823[clinic start generated code]*/
13824
13825static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013826unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013827/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013829 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830}
13831
INADA Naoki3ae20562017-01-16 20:41:20 +090013832/*[clinic input]
13833str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013834
INADA Naoki3ae20562017-01-16 20:41:20 +090013835Return a copy of the string converted to uppercase.
13836[clinic start generated code]*/
13837
13838static PyObject *
13839unicode_upper_impl(PyObject *self)
13840/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013841{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013842 if (PyUnicode_READY(self) == -1)
13843 return NULL;
13844 if (PyUnicode_IS_ASCII(self))
13845 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013846 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013847}
13848
INADA Naoki3ae20562017-01-16 20:41:20 +090013849/*[clinic input]
13850str.zfill as unicode_zfill
13851
13852 width: Py_ssize_t
13853 /
13854
13855Pad a numeric string with zeros on the left, to fill a field of the given width.
13856
13857The string is never truncated.
13858[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013859
13860static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013861unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013862/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013863{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013864 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013865 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013866 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013867 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013868 Py_UCS4 chr;
13869
Benjamin Petersonbac79492012-01-14 13:34:47 -050013870 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013871 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013872
Victor Stinnerc4b49542011-12-11 22:44:26 +010013873 if (PyUnicode_GET_LENGTH(self) >= width)
13874 return unicode_result_unchanged(self);
13875
13876 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013877
13878 u = pad(self, fill, 0, '0');
13879
Walter Dörwald068325e2002-04-15 13:36:47 +000013880 if (u == NULL)
13881 return NULL;
13882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013883 kind = PyUnicode_KIND(u);
13884 data = PyUnicode_DATA(u);
13885 chr = PyUnicode_READ(kind, data, fill);
13886
13887 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013888 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013889 PyUnicode_WRITE(kind, data, 0, chr);
13890 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891 }
13892
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013893 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013894 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013895}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013896
#if 0
/* Compiled out by the surrounding #if 0: forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013905PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013906 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013907\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013908Return True if S starts with the specified prefix, False otherwise.\n\
13909With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013910With optional end, stop comparing S at that position.\n\
13911prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013912
13913static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013914unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013915 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013916{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013917 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013918 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013919 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013920 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013921 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922
Jesus Ceaac451502011-04-20 17:09:23 +020013923 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013924 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013925 if (PyTuple_Check(subobj)) {
13926 Py_ssize_t i;
13927 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013928 substring = PyTuple_GET_ITEM(subobj, i);
13929 if (!PyUnicode_Check(substring)) {
13930 PyErr_Format(PyExc_TypeError,
13931 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013932 "not %.100s",
13933 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013934 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013935 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013936 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013937 if (result == -1)
13938 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013939 if (result) {
13940 Py_RETURN_TRUE;
13941 }
13942 }
13943 /* nothing matched */
13944 Py_RETURN_FALSE;
13945 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013946 if (!PyUnicode_Check(subobj)) {
13947 PyErr_Format(PyExc_TypeError,
13948 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013949 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013950 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013951 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013952 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013953 if (result == -1)
13954 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013955 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013956}
13957
13958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013959PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013960 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013961\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013962Return True if S ends with the specified suffix, False otherwise.\n\
13963With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013964With optional end, stop comparing S at that position.\n\
13965suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013966
13967static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013968unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013969 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013970{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013971 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013972 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013973 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013974 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013975 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013976
Jesus Ceaac451502011-04-20 17:09:23 +020013977 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013978 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013979 if (PyTuple_Check(subobj)) {
13980 Py_ssize_t i;
13981 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013982 substring = PyTuple_GET_ITEM(subobj, i);
13983 if (!PyUnicode_Check(substring)) {
13984 PyErr_Format(PyExc_TypeError,
13985 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013986 "not %.100s",
13987 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013988 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013989 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013990 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013991 if (result == -1)
13992 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013993 if (result) {
13994 Py_RETURN_TRUE;
13995 }
13996 }
13997 Py_RETURN_FALSE;
13998 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013999 if (!PyUnicode_Check(subobj)) {
14000 PyErr_Format(PyExc_TypeError,
14001 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020014002 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000014003 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030014004 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014005 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010014006 if (result == -1)
14007 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014008 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014009}
14010
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014011static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014012_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014013{
Victor Stinnereb36fda2015-10-03 01:55:51 +020014014 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
14015 writer->data = PyUnicode_DATA(writer->buffer);
14016
14017 if (!writer->readonly) {
14018 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020014019 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020014020 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014021 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020014022 /* use a value smaller than PyUnicode_1BYTE_KIND() so
14023 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14024 writer->kind = PyUnicode_WCHAR_KIND;
14025 assert(writer->kind <= PyUnicode_1BYTE_KIND);
14026
Victor Stinner8f674cc2013-04-17 23:02:17 +020014027 /* Copy-on-write mode: set buffer size to 0 so
14028 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
14029 * next write. */
14030 writer->size = 0;
14031 }
Victor Stinner202fdca2012-05-07 12:47:02 +020014032}
14033
Victor Stinnerd3f08822012-05-29 12:57:52 +020014034void
Victor Stinner8f674cc2013-04-17 23:02:17 +020014035_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014036{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014037 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020014038
14039 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020014040 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020014041
14042 /* use a value smaller than PyUnicode_1BYTE_KIND() so
14043 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14044 writer->kind = PyUnicode_WCHAR_KIND;
14045 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020014046}
14047
Inada Naoki770847a2019-06-24 12:30:24 +090014048// Initialize _PyUnicodeWriter with initial buffer
14049static inline void
14050_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
14051{
14052 memset(writer, 0, sizeof(*writer));
14053 writer->buffer = buffer;
14054 _PyUnicodeWriter_Update(writer);
14055 writer->min_length = writer->size;
14056}
14057
Victor Stinnerd3f08822012-05-29 12:57:52 +020014058int
14059_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14060 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020014061{
14062 Py_ssize_t newlen;
14063 PyObject *newbuffer;
14064
Victor Stinner2740e462016-09-06 16:58:36 -070014065 assert(maxchar <= MAX_UNICODE);
14066
Victor Stinnerca9381e2015-09-22 00:58:32 +020014067 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020014068 assert((maxchar > writer->maxchar && length >= 0)
14069 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014070
Victor Stinner202fdca2012-05-07 12:47:02 +020014071 if (length > PY_SSIZE_T_MAX - writer->pos) {
14072 PyErr_NoMemory();
14073 return -1;
14074 }
14075 newlen = writer->pos + length;
14076
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014077 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020014078
Victor Stinnerd3f08822012-05-29 12:57:52 +020014079 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020014080 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010014081 if (writer->overallocate
14082 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14083 /* overallocate to limit the number of realloc() */
14084 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014085 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014086 if (newlen < writer->min_length)
14087 newlen = writer->min_length;
14088
Victor Stinnerd3f08822012-05-29 12:57:52 +020014089 writer->buffer = PyUnicode_New(newlen, maxchar);
14090 if (writer->buffer == NULL)
14091 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014092 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014093 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014094 if (writer->overallocate
14095 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14096 /* overallocate to limit the number of realloc() */
14097 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014098 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014099 if (newlen < writer->min_length)
14100 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014101
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014102 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014103 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014104 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014105 newbuffer = PyUnicode_New(newlen, maxchar);
14106 if (newbuffer == NULL)
14107 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014108 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14109 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014110 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014111 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014112 }
14113 else {
14114 newbuffer = resize_compact(writer->buffer, newlen);
14115 if (newbuffer == NULL)
14116 return -1;
14117 }
14118 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014119 }
14120 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014121 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014122 newbuffer = PyUnicode_New(writer->size, maxchar);
14123 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014124 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014125 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14126 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014127 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014128 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014129 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014130 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014131
14132#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014133}
14134
Victor Stinnerca9381e2015-09-22 00:58:32 +020014135int
14136_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14137 enum PyUnicode_Kind kind)
14138{
14139 Py_UCS4 maxchar;
14140
14141 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14142 assert(writer->kind < kind);
14143
14144 switch (kind)
14145 {
14146 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14147 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
Victor Stinner99768342021-03-17 21:46:53 +010014148 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
Victor Stinnerca9381e2015-09-22 00:58:32 +020014149 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014150 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014151 }
14152
14153 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14154}
14155
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014156static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014157_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014158{
Victor Stinner2740e462016-09-06 16:58:36 -070014159 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014160 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14161 return -1;
14162 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14163 writer->pos++;
14164 return 0;
14165}
14166
14167int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014168_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14169{
14170 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14171}
14172
14173int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014174_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14175{
14176 Py_UCS4 maxchar;
14177 Py_ssize_t len;
14178
14179 if (PyUnicode_READY(str) == -1)
14180 return -1;
14181 len = PyUnicode_GET_LENGTH(str);
14182 if (len == 0)
14183 return 0;
14184 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14185 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014186 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014187 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014188 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014189 Py_INCREF(str);
14190 writer->buffer = str;
14191 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014192 writer->pos += len;
14193 return 0;
14194 }
14195 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14196 return -1;
14197 }
14198 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14199 str, 0, len);
14200 writer->pos += len;
14201 return 0;
14202}
14203
Victor Stinnere215d962012-10-06 23:03:36 +020014204int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014205_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14206 Py_ssize_t start, Py_ssize_t end)
14207{
14208 Py_UCS4 maxchar;
14209 Py_ssize_t len;
14210
14211 if (PyUnicode_READY(str) == -1)
14212 return -1;
14213
14214 assert(0 <= start);
14215 assert(end <= PyUnicode_GET_LENGTH(str));
14216 assert(start <= end);
14217
14218 if (end == 0)
14219 return 0;
14220
14221 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14222 return _PyUnicodeWriter_WriteStr(writer, str);
14223
14224 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14225 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14226 else
14227 maxchar = writer->maxchar;
14228 len = end - start;
14229
14230 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14231 return -1;
14232
14233 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14234 str, start, len);
14235 writer->pos += len;
14236 return 0;
14237}
14238
14239int
Victor Stinner4a587072013-11-19 12:54:53 +010014240_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14241 const char *ascii, Py_ssize_t len)
14242{
14243 if (len == -1)
14244 len = strlen(ascii);
14245
Andy Lestere6be9b52020-02-11 20:28:35 -060014246 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014247
14248 if (writer->buffer == NULL && !writer->overallocate) {
14249 PyObject *str;
14250
14251 str = _PyUnicode_FromASCII(ascii, len);
14252 if (str == NULL)
14253 return -1;
14254
14255 writer->readonly = 1;
14256 writer->buffer = str;
14257 _PyUnicodeWriter_Update(writer);
14258 writer->pos += len;
14259 return 0;
14260 }
14261
14262 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14263 return -1;
14264
14265 switch (writer->kind)
14266 {
14267 case PyUnicode_1BYTE_KIND:
14268 {
14269 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14270 Py_UCS1 *data = writer->data;
14271
Christian Heimesf051e432016-09-13 20:22:02 +020014272 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014273 break;
14274 }
14275 case PyUnicode_2BYTE_KIND:
14276 {
14277 _PyUnicode_CONVERT_BYTES(
14278 Py_UCS1, Py_UCS2,
14279 ascii, ascii + len,
14280 (Py_UCS2 *)writer->data + writer->pos);
14281 break;
14282 }
14283 case PyUnicode_4BYTE_KIND:
14284 {
14285 _PyUnicode_CONVERT_BYTES(
14286 Py_UCS1, Py_UCS4,
14287 ascii, ascii + len,
14288 (Py_UCS4 *)writer->data + writer->pos);
14289 break;
14290 }
14291 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014292 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014293 }
14294
14295 writer->pos += len;
14296 return 0;
14297}
14298
14299int
14300_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14301 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014302{
14303 Py_UCS4 maxchar;
14304
Andy Lestere6be9b52020-02-11 20:28:35 -060014305 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014306 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14307 return -1;
14308 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14309 writer->pos += len;
14310 return 0;
14311}
14312
Victor Stinnerd3f08822012-05-29 12:57:52 +020014313PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014314_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014315{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014316 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014317
Victor Stinnerd3f08822012-05-29 12:57:52 +020014318 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014319 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014320 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014321 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014322
14323 str = writer->buffer;
14324 writer->buffer = NULL;
14325
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014326 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014327 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14328 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014329 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014330
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014331 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14332 PyObject *str2;
14333 str2 = resize_compact(str, writer->pos);
14334 if (str2 == NULL) {
14335 Py_DECREF(str);
14336 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014337 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014338 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014339 }
14340
Victor Stinner15a0bd32013-07-08 22:29:55 +020014341 assert(_PyUnicode_CheckConsistency(str, 1));
14342 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014343}
14344
Victor Stinnerd3f08822012-05-29 12:57:52 +020014345void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014346_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014347{
14348 Py_CLEAR(writer->buffer);
14349}
14350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014351#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014352
14353PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014354 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014355\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014356Return a formatted version of S, using substitutions from args and kwargs.\n\
14357The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014358
Eric Smith27bbca62010-11-04 17:06:58 +000014359PyDoc_STRVAR(format_map__doc__,
14360 "S.format_map(mapping) -> str\n\
14361\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014362Return a formatted version of S, using substitutions from mapping.\n\
14363The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014364
INADA Naoki3ae20562017-01-16 20:41:20 +090014365/*[clinic input]
14366str.__format__ as unicode___format__
14367
14368 format_spec: unicode
14369 /
14370
14371Return a formatted version of the string as described by format_spec.
14372[clinic start generated code]*/
14373
Eric Smith4a7d76d2008-05-30 18:10:19 +000014374static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014375unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014376/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014377{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014378 _PyUnicodeWriter writer;
14379 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014380
Victor Stinnerd3f08822012-05-29 12:57:52 +020014381 if (PyUnicode_READY(self) == -1)
14382 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014383 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014384 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14385 self, format_spec, 0,
14386 PyUnicode_GET_LENGTH(format_spec));
14387 if (ret == -1) {
14388 _PyUnicodeWriter_Dealloc(&writer);
14389 return NULL;
14390 }
14391 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014392}
14393
INADA Naoki3ae20562017-01-16 20:41:20 +090014394/*[clinic input]
14395str.__sizeof__ as unicode_sizeof
14396
14397Return the size of the string in memory, in bytes.
14398[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014399
14400static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014401unicode_sizeof_impl(PyObject *self)
14402/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014404 Py_ssize_t size;
14405
14406 /* If it's a compact object, account for base structure +
14407 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014408 if (PyUnicode_IS_COMPACT_ASCII(self))
14409 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14410 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014411 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014412 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014413 else {
14414 /* If it is a two-block object, account for base object, and
14415 for character block if present. */
14416 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014417 if (_PyUnicode_DATA_ANY(self))
14418 size += (PyUnicode_GET_LENGTH(self) + 1) *
14419 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014420 }
14421 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014422 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014423 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14424 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14425 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14426 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014427
14428 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014429}
14430
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014431static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014432unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014433{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014434 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014435 if (!copy)
14436 return NULL;
14437 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014438}
14439
Guido van Rossumd57fd912000-03-10 22:53:23 +000014440static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014441 UNICODE_ENCODE_METHODDEF
14442 UNICODE_REPLACE_METHODDEF
14443 UNICODE_SPLIT_METHODDEF
14444 UNICODE_RSPLIT_METHODDEF
14445 UNICODE_JOIN_METHODDEF
14446 UNICODE_CAPITALIZE_METHODDEF
14447 UNICODE_CASEFOLD_METHODDEF
14448 UNICODE_TITLE_METHODDEF
14449 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014450 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014451 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014452 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014453 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014454 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014455 UNICODE_LJUST_METHODDEF
14456 UNICODE_LOWER_METHODDEF
14457 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014458 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14459 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014460 UNICODE_RJUST_METHODDEF
14461 UNICODE_RSTRIP_METHODDEF
14462 UNICODE_RPARTITION_METHODDEF
14463 UNICODE_SPLITLINES_METHODDEF
14464 UNICODE_STRIP_METHODDEF
14465 UNICODE_SWAPCASE_METHODDEF
14466 UNICODE_TRANSLATE_METHODDEF
14467 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014468 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14469 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014470 UNICODE_REMOVEPREFIX_METHODDEF
14471 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014472 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014473 UNICODE_ISLOWER_METHODDEF
14474 UNICODE_ISUPPER_METHODDEF
14475 UNICODE_ISTITLE_METHODDEF
14476 UNICODE_ISSPACE_METHODDEF
14477 UNICODE_ISDECIMAL_METHODDEF
14478 UNICODE_ISDIGIT_METHODDEF
14479 UNICODE_ISNUMERIC_METHODDEF
14480 UNICODE_ISALPHA_METHODDEF
14481 UNICODE_ISALNUM_METHODDEF
14482 UNICODE_ISIDENTIFIER_METHODDEF
14483 UNICODE_ISPRINTABLE_METHODDEF
14484 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014485 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014486 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014487 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014488 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014489 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014490#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014491 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014492 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014493#endif
14494
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014495 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014496 {NULL, NULL}
14497};
14498
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014499static PyObject *
14500unicode_mod(PyObject *v, PyObject *w)
14501{
Brian Curtindfc80e32011-08-10 20:28:54 -050014502 if (!PyUnicode_Check(v))
14503 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014504 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014505}
14506
14507static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014508 0, /*nb_add*/
14509 0, /*nb_subtract*/
14510 0, /*nb_multiply*/
14511 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014512};
14513
Guido van Rossumd57fd912000-03-10 22:53:23 +000014514static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014515 (lenfunc) unicode_length, /* sq_length */
14516 PyUnicode_Concat, /* sq_concat */
14517 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14518 (ssizeargfunc) unicode_getitem, /* sq_item */
14519 0, /* sq_slice */
14520 0, /* sq_ass_item */
14521 0, /* sq_ass_slice */
14522 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014523};
14524
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014525static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014526unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014528 if (PyUnicode_READY(self) == -1)
14529 return NULL;
14530
Victor Stinnera15e2602020-04-08 02:01:56 +020014531 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014532 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014533 if (i == -1 && PyErr_Occurred())
14534 return NULL;
14535 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014536 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014537 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014538 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014539 Py_ssize_t start, stop, step, slicelength, i;
14540 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014541 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014542 const void *src_data;
14543 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014544 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014545 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014546
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014547 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014548 return NULL;
14549 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014550 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14551 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014552
14553 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014554 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014555 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014556 slicelength == PyUnicode_GET_LENGTH(self)) {
14557 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014558 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014559 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014560 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014561 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014562 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014563 src_kind = PyUnicode_KIND(self);
14564 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014565 if (!PyUnicode_IS_ASCII(self)) {
14566 kind_limit = kind_maxchar_limit(src_kind);
14567 max_char = 0;
14568 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14569 ch = PyUnicode_READ(src_kind, src_data, cur);
14570 if (ch > max_char) {
14571 max_char = ch;
14572 if (max_char >= kind_limit)
14573 break;
14574 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014575 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014576 }
Victor Stinner55c99112011-10-13 01:17:06 +020014577 else
14578 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014579 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014580 if (result == NULL)
14581 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014582 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014583 dest_data = PyUnicode_DATA(result);
14584
14585 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014586 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14587 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014588 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014589 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014590 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014591 } else {
14592 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14593 return NULL;
14594 }
14595}
14596
14597static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014598 (lenfunc)unicode_length, /* mp_length */
14599 (binaryfunc)unicode_subscript, /* mp_subscript */
14600 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014601};
14602
Guido van Rossumd57fd912000-03-10 22:53:23 +000014603
Guido van Rossumd57fd912000-03-10 22:53:23 +000014604/* Helpers for PyUnicode_Format() */
14605
Victor Stinnera47082312012-10-04 02:19:54 +020014606struct unicode_formatter_t {
14607 PyObject *args;
14608 int args_owned;
14609 Py_ssize_t arglen, argidx;
14610 PyObject *dict;
14611
14612 enum PyUnicode_Kind fmtkind;
14613 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014614 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014615 PyObject *fmtstr;
14616
14617 _PyUnicodeWriter writer;
14618};
14619
/* Description of one conversion specifier being formatted. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; formatfloat() passes it as
                           the format code to PyOS_double_to_string() */
    int flags;          /* bitmask of F_* flags; formatfloat() tests F_ALT */
    Py_ssize_t width;   /* presumably the requested field width -- confirm */
    int prec;           /* precision; formatfloat() uses 6 when negative */
    int sign;           /* NOTE(review): semantics not visible here */
};
14627
Guido van Rossumd57fd912000-03-10 22:53:23 +000014628static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014629unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014630{
Victor Stinnera47082312012-10-04 02:19:54 +020014631 Py_ssize_t argidx = ctx->argidx;
14632
14633 if (argidx < ctx->arglen) {
14634 ctx->argidx++;
14635 if (ctx->arglen < 0)
14636 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014637 else
Victor Stinnera47082312012-10-04 02:19:54 +020014638 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014639 }
14640 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014641 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014642 return NULL;
14643}
14644
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014645/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014646
Victor Stinnera47082312012-10-04 02:19:54 +020014647/* Format a float into the writer if the writer is not NULL, or into *p_output
14648 otherwise.
14649
14650 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014651static int
Victor Stinnera47082312012-10-04 02:19:54 +020014652formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14653 PyObject **p_output,
14654 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014655{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014656 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014657 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014658 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014659 int prec;
14660 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014661
Guido van Rossumd57fd912000-03-10 22:53:23 +000014662 x = PyFloat_AsDouble(v);
14663 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014664 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014665
Victor Stinnera47082312012-10-04 02:19:54 +020014666 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014667 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014668 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014669
Victor Stinnera47082312012-10-04 02:19:54 +020014670 if (arg->flags & F_ALT)
14671 dtoa_flags = Py_DTSF_ALT;
14672 else
14673 dtoa_flags = 0;
14674 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014675 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014676 return -1;
14677 len = strlen(p);
14678 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014679 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014680 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014681 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014682 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014683 }
14684 else
14685 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014686 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014687 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014688}
14689
Victor Stinnerd0880d52012-04-27 23:40:13 +020014690/* formatlong() emulates the format codes d, u, o, x and X, and
14691 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14692 * Python's regular ints.
14693 * Return value: a new PyUnicodeObject*, or NULL if error.
14694 * The output string is of the form
14695 * "-"? ("0x" | "0X")? digit+
14696 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14697 * set in flags. The case of hex digits will be correct,
14698 * There will be at least prec digits, zero-filled on the left if
14699 * necessary to get that many.
14700 * val object to be converted
14701 * flags bitmask of format flags; only F_ALT is looked at
14702 * prec minimum number of digits; 0-fill on left if needed
14703 * type a character in [duoxX]; u acts the same as d
14704 *
14705 * CAUTION: o, x and X conversions on regular ints can never
14706 * produce a '-' sign, but can for Python's unbounded ints.
14707 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014708PyObject *
14709_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014710{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014711 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014712 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014713 Py_ssize_t i;
14714 int sign; /* 1 if '-', else 0 */
14715 int len; /* number of characters */
14716 Py_ssize_t llen;
14717 int numdigits; /* len == numnondigits + numdigits */
14718 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014719
Victor Stinnerd0880d52012-04-27 23:40:13 +020014720 /* Avoid exceeding SSIZE_T_MAX */
14721 if (prec > INT_MAX-3) {
14722 PyErr_SetString(PyExc_OverflowError,
14723 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014724 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014725 }
14726
14727 assert(PyLong_Check(val));
14728
14729 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014730 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014731 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014732 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014733 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014734 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014735 /* int and int subclasses should print numerically when a numeric */
14736 /* format code is used (see issue18780) */
14737 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014738 break;
14739 case 'o':
14740 numnondigits = 2;
14741 result = PyNumber_ToBase(val, 8);
14742 break;
14743 case 'x':
14744 case 'X':
14745 numnondigits = 2;
14746 result = PyNumber_ToBase(val, 16);
14747 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014748 }
14749 if (!result)
14750 return NULL;
14751
14752 assert(unicode_modifiable(result));
14753 assert(PyUnicode_IS_READY(result));
14754 assert(PyUnicode_IS_ASCII(result));
14755
14756 /* To modify the string in-place, there can only be one reference. */
14757 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014758 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014759 PyErr_BadInternalCall();
14760 return NULL;
14761 }
14762 buf = PyUnicode_DATA(result);
14763 llen = PyUnicode_GET_LENGTH(result);
14764 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014765 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014766 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014767 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014768 return NULL;
14769 }
14770 len = (int)llen;
14771 sign = buf[0] == '-';
14772 numnondigits += sign;
14773 numdigits = len - numnondigits;
14774 assert(numdigits > 0);
14775
14776 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014777 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014778 (type == 'o' || type == 'x' || type == 'X'))) {
14779 assert(buf[sign] == '0');
14780 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14781 buf[sign+1] == 'o');
14782 numnondigits -= 2;
14783 buf += 2;
14784 len -= 2;
14785 if (sign)
14786 buf[0] = '-';
14787 assert(len == numnondigits + numdigits);
14788 assert(numdigits > 0);
14789 }
14790
14791 /* Fill with leading zeroes to meet minimum width. */
14792 if (prec > numdigits) {
14793 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14794 numnondigits + prec);
14795 char *b1;
14796 if (!r1) {
14797 Py_DECREF(result);
14798 return NULL;
14799 }
14800 b1 = PyBytes_AS_STRING(r1);
14801 for (i = 0; i < numnondigits; ++i)
14802 *b1++ = *buf++;
14803 for (i = 0; i < prec - numdigits; i++)
14804 *b1++ = '0';
14805 for (i = 0; i < numdigits; i++)
14806 *b1++ = *buf++;
14807 *b1 = '\0';
14808 Py_DECREF(result);
14809 result = r1;
14810 buf = PyBytes_AS_STRING(result);
14811 len = numnondigits + prec;
14812 }
14813
14814 /* Fix up case for hex conversions. */
14815 if (type == 'X') {
14816 /* Need to convert all lower case letters to upper case.
14817 and need to convert 0x to 0X (and -0x to -0X). */
14818 for (i = 0; i < len; i++)
14819 if (buf[i] >= 'a' && buf[i] <= 'x')
14820 buf[i] -= 'a'-'A';
14821 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014822 if (!PyUnicode_Check(result)
14823 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014824 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014825 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014826 Py_DECREF(result);
14827 result = unicode;
14828 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014829 else if (len != PyUnicode_GET_LENGTH(result)) {
14830 if (PyUnicode_Resize(&result, len) < 0)
14831 Py_CLEAR(result);
14832 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014833 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014834}
14835
Ethan Furmandf3ed242014-01-05 06:50:30 -080014836/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014837 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014838 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014839 * -1 and raise an exception on error */
14840static int
Victor Stinnera47082312012-10-04 02:19:54 +020014841mainformatlong(PyObject *v,
14842 struct unicode_format_arg_t *arg,
14843 PyObject **p_output,
14844 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014845{
14846 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014847 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014848
14849 if (!PyNumber_Check(v))
14850 goto wrongtype;
14851
Ethan Furman9ab74802014-03-21 06:38:46 -070014852 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014853 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014854 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014855 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014856 }
14857 else {
14858 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014859 }
14860 if (iobj == NULL ) {
14861 if (PyErr_ExceptionMatches(PyExc_TypeError))
14862 goto wrongtype;
14863 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014864 }
14865 assert(PyLong_Check(iobj));
14866 }
14867 else {
14868 iobj = v;
14869 Py_INCREF(iobj);
14870 }
14871
14872 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014873 && arg->width == -1 && arg->prec == -1
14874 && !(arg->flags & (F_SIGN | F_BLANK))
14875 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014876 {
14877 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014878 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014879 int base;
14880
Victor Stinnera47082312012-10-04 02:19:54 +020014881 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014882 {
14883 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014884 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014885 case 'd':
14886 case 'i':
14887 case 'u':
14888 base = 10;
14889 break;
14890 case 'o':
14891 base = 8;
14892 break;
14893 case 'x':
14894 case 'X':
14895 base = 16;
14896 break;
14897 }
14898
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014899 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14900 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014901 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014902 }
14903 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014904 return 1;
14905 }
14906
Ethan Furmanb95b5612015-01-23 20:05:18 -080014907 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014908 Py_DECREF(iobj);
14909 if (res == NULL)
14910 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014911 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014912 return 0;
14913
14914wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014915 switch(type)
14916 {
14917 case 'o':
14918 case 'x':
14919 case 'X':
14920 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014921 "%%%c format: an integer is required, "
14922 "not %.200s",
14923 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014924 break;
14925 default:
14926 PyErr_Format(PyExc_TypeError,
Serhiy Storchakae2ec0b22020-10-09 14:14:37 +030014927 "%%%c format: a real number is required, "
Victor Stinner998b8062018-09-12 00:23:25 +020014928 "not %.200s",
14929 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014930 break;
14931 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014932 return -1;
14933}
14934
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014935static Py_UCS4
14936formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014937{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014938 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014939 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014940 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014941 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014942 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014943 goto onError;
14944 }
14945 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014946 int overflow;
14947 long x = PyLong_AsLongAndOverflow(v, &overflow);
14948 if (x == -1 && PyErr_Occurred()) {
14949 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014950 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014951 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014952 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014953 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014954
Victor Stinner8faf8212011-12-08 22:14:11 +010014955 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014956 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014957 PyErr_SetString(PyExc_OverflowError,
14958 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014959 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014960 }
14961
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014962 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014963 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014964
Benjamin Peterson29060642009-01-31 22:14:21 +000014965 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014966 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014967 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014968 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014969}
14970
Victor Stinnera47082312012-10-04 02:19:54 +020014971/* Parse options of an argument: flags, width, precision.
14972 Handle also "%(name)" syntax.
14973
14974 Return 0 if the argument has been formatted into arg->str.
14975 Return 1 if the argument has been written into ctx->writer,
14976 Raise an exception and return -1 on error. */
14977static int
14978unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14979 struct unicode_format_arg_t *arg)
14980{
14981#define FORMAT_READ(ctx) \
14982 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14983
14984 PyObject *v;
14985
Victor Stinnera47082312012-10-04 02:19:54 +020014986 if (arg->ch == '(') {
14987 /* Get argument value from a dictionary. Example: "%(name)s". */
14988 Py_ssize_t keystart;
14989 Py_ssize_t keylen;
14990 PyObject *key;
14991 int pcount = 1;
14992
14993 if (ctx->dict == NULL) {
14994 PyErr_SetString(PyExc_TypeError,
14995 "format requires a mapping");
14996 return -1;
14997 }
14998 ++ctx->fmtpos;
14999 --ctx->fmtcnt;
15000 keystart = ctx->fmtpos;
15001 /* Skip over balanced parentheses */
15002 while (pcount > 0 && --ctx->fmtcnt >= 0) {
15003 arg->ch = FORMAT_READ(ctx);
15004 if (arg->ch == ')')
15005 --pcount;
15006 else if (arg->ch == '(')
15007 ++pcount;
15008 ctx->fmtpos++;
15009 }
15010 keylen = ctx->fmtpos - keystart - 1;
15011 if (ctx->fmtcnt < 0 || pcount > 0) {
15012 PyErr_SetString(PyExc_ValueError,
15013 "incomplete format key");
15014 return -1;
15015 }
15016 key = PyUnicode_Substring(ctx->fmtstr,
15017 keystart, keystart + keylen);
15018 if (key == NULL)
15019 return -1;
15020 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020015021 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020015022 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020015023 }
15024 ctx->args = PyObject_GetItem(ctx->dict, key);
15025 Py_DECREF(key);
15026 if (ctx->args == NULL)
15027 return -1;
15028 ctx->args_owned = 1;
15029 ctx->arglen = -1;
15030 ctx->argidx = -2;
15031 }
15032
15033 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020015034 while (--ctx->fmtcnt >= 0) {
15035 arg->ch = FORMAT_READ(ctx);
15036 ctx->fmtpos++;
15037 switch (arg->ch) {
15038 case '-': arg->flags |= F_LJUST; continue;
15039 case '+': arg->flags |= F_SIGN; continue;
15040 case ' ': arg->flags |= F_BLANK; continue;
15041 case '#': arg->flags |= F_ALT; continue;
15042 case '0': arg->flags |= F_ZERO; continue;
15043 }
15044 break;
15045 }
15046
15047 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020015048 if (arg->ch == '*') {
15049 v = unicode_format_getnextarg(ctx);
15050 if (v == NULL)
15051 return -1;
15052 if (!PyLong_Check(v)) {
15053 PyErr_SetString(PyExc_TypeError,
15054 "* wants int");
15055 return -1;
15056 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015057 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015058 if (arg->width == -1 && PyErr_Occurred())
15059 return -1;
15060 if (arg->width < 0) {
15061 arg->flags |= F_LJUST;
15062 arg->width = -arg->width;
15063 }
15064 if (--ctx->fmtcnt >= 0) {
15065 arg->ch = FORMAT_READ(ctx);
15066 ctx->fmtpos++;
15067 }
15068 }
15069 else if (arg->ch >= '0' && arg->ch <= '9') {
15070 arg->width = arg->ch - '0';
15071 while (--ctx->fmtcnt >= 0) {
15072 arg->ch = FORMAT_READ(ctx);
15073 ctx->fmtpos++;
15074 if (arg->ch < '0' || arg->ch > '9')
15075 break;
15076 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15077 mixing signed and unsigned comparison. Since arg->ch is between
15078 '0' and '9', casting to int is safe. */
15079 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15080 PyErr_SetString(PyExc_ValueError,
15081 "width too big");
15082 return -1;
15083 }
15084 arg->width = arg->width*10 + (arg->ch - '0');
15085 }
15086 }
15087
15088 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015089 if (arg->ch == '.') {
15090 arg->prec = 0;
15091 if (--ctx->fmtcnt >= 0) {
15092 arg->ch = FORMAT_READ(ctx);
15093 ctx->fmtpos++;
15094 }
15095 if (arg->ch == '*') {
15096 v = unicode_format_getnextarg(ctx);
15097 if (v == NULL)
15098 return -1;
15099 if (!PyLong_Check(v)) {
15100 PyErr_SetString(PyExc_TypeError,
15101 "* wants int");
15102 return -1;
15103 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015104 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015105 if (arg->prec == -1 && PyErr_Occurred())
15106 return -1;
15107 if (arg->prec < 0)
15108 arg->prec = 0;
15109 if (--ctx->fmtcnt >= 0) {
15110 arg->ch = FORMAT_READ(ctx);
15111 ctx->fmtpos++;
15112 }
15113 }
15114 else if (arg->ch >= '0' && arg->ch <= '9') {
15115 arg->prec = arg->ch - '0';
15116 while (--ctx->fmtcnt >= 0) {
15117 arg->ch = FORMAT_READ(ctx);
15118 ctx->fmtpos++;
15119 if (arg->ch < '0' || arg->ch > '9')
15120 break;
15121 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15122 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015123 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015124 return -1;
15125 }
15126 arg->prec = arg->prec*10 + (arg->ch - '0');
15127 }
15128 }
15129 }
15130
15131 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15132 if (ctx->fmtcnt >= 0) {
15133 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15134 if (--ctx->fmtcnt >= 0) {
15135 arg->ch = FORMAT_READ(ctx);
15136 ctx->fmtpos++;
15137 }
15138 }
15139 }
15140 if (ctx->fmtcnt < 0) {
15141 PyErr_SetString(PyExc_ValueError,
15142 "incomplete format");
15143 return -1;
15144 }
15145 return 0;
15146
15147#undef FORMAT_READ
15148}
15149
15150/* Format one argument. Supported conversion specifiers:
15151
15152 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015153 - "i", "d", "u": int or float
15154 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015155 - "e", "E", "f", "F", "g", "G": float
15156 - "c": int or str (1 character)
15157
Victor Stinner8dbd4212012-12-04 09:30:24 +010015158 When possible, the output is written directly into the Unicode writer
15159 (ctx->writer). A string is created when padding is required.
15160
Victor Stinnera47082312012-10-04 02:19:54 +020015161 Return 0 if the argument has been formatted into *p_str,
15162 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015163 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015164static int
15165unicode_format_arg_format(struct unicode_formatter_t *ctx,
15166 struct unicode_format_arg_t *arg,
15167 PyObject **p_str)
15168{
15169 PyObject *v;
15170 _PyUnicodeWriter *writer = &ctx->writer;
15171
15172 if (ctx->fmtcnt == 0)
15173 ctx->writer.overallocate = 0;
15174
Victor Stinnera47082312012-10-04 02:19:54 +020015175 v = unicode_format_getnextarg(ctx);
15176 if (v == NULL)
15177 return -1;
15178
Victor Stinnera47082312012-10-04 02:19:54 +020015179
15180 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015181 case 's':
15182 case 'r':
15183 case 'a':
15184 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15185 /* Fast path */
15186 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15187 return -1;
15188 return 1;
15189 }
15190
15191 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15192 *p_str = v;
15193 Py_INCREF(*p_str);
15194 }
15195 else {
15196 if (arg->ch == 's')
15197 *p_str = PyObject_Str(v);
15198 else if (arg->ch == 'r')
15199 *p_str = PyObject_Repr(v);
15200 else
15201 *p_str = PyObject_ASCII(v);
15202 }
15203 break;
15204
15205 case 'i':
15206 case 'd':
15207 case 'u':
15208 case 'o':
15209 case 'x':
15210 case 'X':
15211 {
15212 int ret = mainformatlong(v, arg, p_str, writer);
15213 if (ret != 0)
15214 return ret;
15215 arg->sign = 1;
15216 break;
15217 }
15218
15219 case 'e':
15220 case 'E':
15221 case 'f':
15222 case 'F':
15223 case 'g':
15224 case 'G':
15225 if (arg->width == -1 && arg->prec == -1
15226 && !(arg->flags & (F_SIGN | F_BLANK)))
15227 {
15228 /* Fast path */
15229 if (formatfloat(v, arg, NULL, writer) == -1)
15230 return -1;
15231 return 1;
15232 }
15233
15234 arg->sign = 1;
15235 if (formatfloat(v, arg, p_str, NULL) == -1)
15236 return -1;
15237 break;
15238
15239 case 'c':
15240 {
15241 Py_UCS4 ch = formatchar(v);
15242 if (ch == (Py_UCS4) -1)
15243 return -1;
15244 if (arg->width == -1 && arg->prec == -1) {
15245 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015246 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015247 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015248 return 1;
15249 }
15250 *p_str = PyUnicode_FromOrdinal(ch);
15251 break;
15252 }
15253
15254 default:
15255 PyErr_Format(PyExc_ValueError,
15256 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015257 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015258 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15259 (int)arg->ch,
15260 ctx->fmtpos - 1);
15261 return -1;
15262 }
15263 if (*p_str == NULL)
15264 return -1;
15265 assert (PyUnicode_Check(*p_str));
15266 return 0;
15267}
15268
15269static int
15270unicode_format_arg_output(struct unicode_formatter_t *ctx,
15271 struct unicode_format_arg_t *arg,
15272 PyObject *str)
15273{
15274 Py_ssize_t len;
15275 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015276 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015277 Py_ssize_t pindex;
15278 Py_UCS4 signchar;
15279 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015280 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015281 Py_ssize_t sublen;
15282 _PyUnicodeWriter *writer = &ctx->writer;
15283 Py_UCS4 fill;
15284
15285 fill = ' ';
15286 if (arg->sign && arg->flags & F_ZERO)
15287 fill = '0';
15288
15289 if (PyUnicode_READY(str) == -1)
15290 return -1;
15291
15292 len = PyUnicode_GET_LENGTH(str);
15293 if ((arg->width == -1 || arg->width <= len)
15294 && (arg->prec == -1 || arg->prec >= len)
15295 && !(arg->flags & (F_SIGN | F_BLANK)))
15296 {
15297 /* Fast path */
15298 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15299 return -1;
15300 return 0;
15301 }
15302
15303 /* Truncate the string for "s", "r" and "a" formats
15304 if the precision is set */
15305 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15306 if (arg->prec >= 0 && len > arg->prec)
15307 len = arg->prec;
15308 }
15309
15310 /* Adjust sign and width */
15311 kind = PyUnicode_KIND(str);
15312 pbuf = PyUnicode_DATA(str);
15313 pindex = 0;
15314 signchar = '\0';
15315 if (arg->sign) {
15316 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15317 if (ch == '-' || ch == '+') {
15318 signchar = ch;
15319 len--;
15320 pindex++;
15321 }
15322 else if (arg->flags & F_SIGN)
15323 signchar = '+';
15324 else if (arg->flags & F_BLANK)
15325 signchar = ' ';
15326 else
15327 arg->sign = 0;
15328 }
15329 if (arg->width < len)
15330 arg->width = len;
15331
15332 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015333 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015334 if (!(arg->flags & F_LJUST)) {
15335 if (arg->sign) {
15336 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015337 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015338 }
15339 else {
15340 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015341 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015342 }
15343 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015344 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15345 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015346 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015347 }
15348
Victor Stinnera47082312012-10-04 02:19:54 +020015349 buflen = arg->width;
15350 if (arg->sign && len == arg->width)
15351 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015352 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015353 return -1;
15354
15355 /* Write the sign if needed */
15356 if (arg->sign) {
15357 if (fill != ' ') {
15358 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15359 writer->pos += 1;
15360 }
15361 if (arg->width > len)
15362 arg->width--;
15363 }
15364
15365 /* Write the numeric prefix for "x", "X" and "o" formats
15366 if the alternate form is used.
15367 For example, write "0x" for the "%#x" format. */
15368 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15369 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15370 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15371 if (fill != ' ') {
15372 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15373 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15374 writer->pos += 2;
15375 pindex += 2;
15376 }
15377 arg->width -= 2;
15378 if (arg->width < 0)
15379 arg->width = 0;
15380 len -= 2;
15381 }
15382
15383 /* Pad left with the fill character if needed */
15384 if (arg->width > len && !(arg->flags & F_LJUST)) {
15385 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015386 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015387 writer->pos += sublen;
15388 arg->width = len;
15389 }
15390
15391 /* If padding with spaces: write sign if needed and/or numeric prefix if
15392 the alternate form is used */
15393 if (fill == ' ') {
15394 if (arg->sign) {
15395 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15396 writer->pos += 1;
15397 }
15398 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15399 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15400 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15401 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15402 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15403 writer->pos += 2;
15404 pindex += 2;
15405 }
15406 }
15407
15408 /* Write characters */
15409 if (len) {
15410 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15411 str, pindex, len);
15412 writer->pos += len;
15413 }
15414
15415 /* Pad right with the fill character if needed */
15416 if (arg->width > len) {
15417 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015418 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015419 writer->pos += sublen;
15420 }
15421 return 0;
15422}
15423
15424/* Helper of PyUnicode_Format(): format one arg.
15425 Return 0 on success, raise an exception and return -1 on error. */
15426static int
15427unicode_format_arg(struct unicode_formatter_t *ctx)
15428{
15429 struct unicode_format_arg_t arg;
15430 PyObject *str;
15431 int ret;
15432
Victor Stinner8dbd4212012-12-04 09:30:24 +010015433 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015434 if (arg.ch == '%') {
15435 ctx->fmtpos++;
15436 ctx->fmtcnt--;
15437 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15438 return -1;
15439 return 0;
15440 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015441 arg.flags = 0;
15442 arg.width = -1;
15443 arg.prec = -1;
15444 arg.sign = 0;
15445 str = NULL;
15446
Victor Stinnera47082312012-10-04 02:19:54 +020015447 ret = unicode_format_arg_parse(ctx, &arg);
15448 if (ret == -1)
15449 return -1;
15450
15451 ret = unicode_format_arg_format(ctx, &arg, &str);
15452 if (ret == -1)
15453 return -1;
15454
15455 if (ret != 1) {
15456 ret = unicode_format_arg_output(ctx, &arg, str);
15457 Py_DECREF(str);
15458 if (ret == -1)
15459 return -1;
15460 }
15461
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015462 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015463 PyErr_SetString(PyExc_TypeError,
15464 "not all arguments converted during string formatting");
15465 return -1;
15466 }
15467 return 0;
15468}
15469
Alexander Belopolsky40018472011-02-26 01:02:56 +000015470PyObject *
15471PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015472{
Victor Stinnera47082312012-10-04 02:19:54 +020015473 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015474
Guido van Rossumd57fd912000-03-10 22:53:23 +000015475 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015476 PyErr_BadInternalCall();
15477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015478 }
Victor Stinnera47082312012-10-04 02:19:54 +020015479
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015480 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015481 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015482
15483 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015484 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15485 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15486 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15487 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015488
Victor Stinner8f674cc2013-04-17 23:02:17 +020015489 _PyUnicodeWriter_Init(&ctx.writer);
15490 ctx.writer.min_length = ctx.fmtcnt + 100;
15491 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015492
Guido van Rossumd57fd912000-03-10 22:53:23 +000015493 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015494 ctx.arglen = PyTuple_Size(args);
15495 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015496 }
15497 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015498 ctx.arglen = -1;
15499 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015500 }
Victor Stinnera47082312012-10-04 02:19:54 +020015501 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015502 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015503 ctx.dict = args;
15504 else
15505 ctx.dict = NULL;
15506 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015507
Victor Stinnera47082312012-10-04 02:19:54 +020015508 while (--ctx.fmtcnt >= 0) {
15509 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015510 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015511
15512 nonfmtpos = ctx.fmtpos++;
15513 while (ctx.fmtcnt >= 0 &&
15514 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15515 ctx.fmtpos++;
15516 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015517 }
Victor Stinnera47082312012-10-04 02:19:54 +020015518 if (ctx.fmtcnt < 0) {
15519 ctx.fmtpos--;
15520 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015521 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015522
Victor Stinnercfc4c132013-04-03 01:48:39 +020015523 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15524 nonfmtpos, ctx.fmtpos) < 0)
15525 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015526 }
15527 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015528 ctx.fmtpos++;
15529 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015530 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015531 }
15532 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015533
Victor Stinnera47082312012-10-04 02:19:54 +020015534 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015535 PyErr_SetString(PyExc_TypeError,
15536 "not all arguments converted during string formatting");
15537 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015538 }
15539
Victor Stinnera47082312012-10-04 02:19:54 +020015540 if (ctx.args_owned) {
15541 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015542 }
Victor Stinnera47082312012-10-04 02:19:54 +020015543 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015544
Benjamin Peterson29060642009-01-31 22:14:21 +000015545 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015546 _PyUnicodeWriter_Dealloc(&ctx.writer);
15547 if (ctx.args_owned) {
15548 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015549 }
15550 return NULL;
15551}
15552
Jeremy Hylton938ace62002-07-17 16:30:39 +000015553static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015554unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15555
15556/*[clinic input]
15557@classmethod
15558str.__new__ as unicode_new
15559
15560 object as x: object = NULL
15561 encoding: str = NULL
15562 errors: str = NULL
15563
15564[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015565
Tim Peters6d6c1a32001-08-02 04:15:00 +000015566static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015567unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15568 const char *errors)
15569/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015570{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015571 PyObject *unicode;
15572 if (x == NULL) {
15573 unicode = unicode_new_empty();
15574 }
15575 else if (encoding == NULL && errors == NULL) {
15576 unicode = PyObject_Str(x);
15577 }
15578 else {
15579 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15580 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015581
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015582 if (unicode != NULL && type != &PyUnicode_Type) {
15583 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15584 }
15585 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015586}
15587
Guido van Rossume023fe02001-08-30 03:12:59 +000015588static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015589unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015590{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015591 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015592 Py_ssize_t length, char_size;
15593 int share_wstr, share_utf8;
15594 unsigned int kind;
15595 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015596
Benjamin Peterson14339b62009-01-31 16:36:08 +000015597 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015598 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015599 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015600 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015601 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015602
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015603 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015604 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015605 return NULL;
15606 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015607 kind = PyUnicode_KIND(unicode);
15608 length = PyUnicode_GET_LENGTH(unicode);
15609
15610 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015611#ifdef Py_DEBUG
15612 _PyUnicode_HASH(self) = -1;
15613#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015614 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015615#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015616 _PyUnicode_STATE(self).interned = 0;
15617 _PyUnicode_STATE(self).kind = kind;
15618 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015619 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015620 _PyUnicode_STATE(self).ready = 1;
15621 _PyUnicode_WSTR(self) = NULL;
15622 _PyUnicode_UTF8_LENGTH(self) = 0;
15623 _PyUnicode_UTF8(self) = NULL;
15624 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015625 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015626
15627 share_utf8 = 0;
15628 share_wstr = 0;
15629 if (kind == PyUnicode_1BYTE_KIND) {
15630 char_size = 1;
15631 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15632 share_utf8 = 1;
15633 }
15634 else if (kind == PyUnicode_2BYTE_KIND) {
15635 char_size = 2;
15636 if (sizeof(wchar_t) == 2)
15637 share_wstr = 1;
15638 }
15639 else {
15640 assert(kind == PyUnicode_4BYTE_KIND);
15641 char_size = 4;
15642 if (sizeof(wchar_t) == 4)
15643 share_wstr = 1;
15644 }
15645
15646 /* Ensure we won't overflow the length. */
15647 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15648 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015649 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015650 }
Victor Stinner32bd68c2020-12-01 10:37:39 +010015651 data = PyObject_Malloc((length + 1) * char_size);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015652 if (data == NULL) {
15653 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015654 goto onError;
15655 }
15656
Victor Stinnerc3c74152011-10-02 20:39:55 +020015657 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015658 if (share_utf8) {
15659 _PyUnicode_UTF8_LENGTH(self) = length;
15660 _PyUnicode_UTF8(self) = data;
15661 }
15662 if (share_wstr) {
15663 _PyUnicode_WSTR_LENGTH(self) = length;
15664 _PyUnicode_WSTR(self) = (wchar_t *)data;
15665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015666
Christian Heimesf051e432016-09-13 20:22:02 +020015667 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015668 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015669 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015670#ifdef Py_DEBUG
15671 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15672#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015673 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015674
15675onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015676 Py_DECREF(self);
15677 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015678}
15679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015680PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015681"str(object='') -> str\n\
15682str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015683\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015684Create a new string object from the given object. If encoding or\n\
15685errors is specified, then the object must expose a data buffer\n\
15686that will be decoded using the given encoding and error handler.\n\
15687Otherwise, returns the result of object.__str__() (if defined)\n\
15688or repr(object).\n\
15689encoding defaults to sys.getdefaultencoding().\n\
15690errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015691
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015692static PyObject *unicode_iter(PyObject *seq);
15693
Guido van Rossumd57fd912000-03-10 22:53:23 +000015694PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015695 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015696 "str", /* tp_name */
15697 sizeof(PyUnicodeObject), /* tp_basicsize */
15698 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015699 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015700 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015701 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015702 0, /* tp_getattr */
15703 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015704 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015705 unicode_repr, /* tp_repr */
15706 &unicode_as_number, /* tp_as_number */
15707 &unicode_as_sequence, /* tp_as_sequence */
15708 &unicode_as_mapping, /* tp_as_mapping */
15709 (hashfunc) unicode_hash, /* tp_hash*/
15710 0, /* tp_call*/
15711 (reprfunc) unicode_str, /* tp_str */
15712 PyObject_GenericGetAttr, /* tp_getattro */
15713 0, /* tp_setattro */
15714 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015715 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Brandt Bucher145bf262021-02-26 14:51:55 -080015716 Py_TPFLAGS_UNICODE_SUBCLASS |
15717 _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
Bupfc93bd42018-06-19 03:59:55 -050015718 unicode_doc, /* tp_doc */
15719 0, /* tp_traverse */
15720 0, /* tp_clear */
15721 PyUnicode_RichCompare, /* tp_richcompare */
15722 0, /* tp_weaklistoffset */
15723 unicode_iter, /* tp_iter */
15724 0, /* tp_iternext */
15725 unicode_methods, /* tp_methods */
15726 0, /* tp_members */
15727 0, /* tp_getset */
15728 &PyBaseObject_Type, /* tp_base */
15729 0, /* tp_dict */
15730 0, /* tp_descr_get */
15731 0, /* tp_descr_set */
15732 0, /* tp_dictoffset */
15733 0, /* tp_init */
15734 0, /* tp_alloc */
15735 unicode_new, /* tp_new */
15736 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015737};
15738
15739/* Initialize the Unicode implementation */
15740
Victor Stinner331a6a52019-05-27 16:39:22 +020015741PyStatus
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015742_PyUnicode_Init(PyInterpreterState *interp)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015743{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015744 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinner91698d82020-06-25 14:07:40 +020015745 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015746 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015747 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015748
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015749 if (_Py_IsMainInterpreter(interp)) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015750 /* initialize the linebreak bloom filter */
Victor Stinner442ad742021-04-02 15:28:13 +020015751 const Py_UCS2 linebreak[] = {
15752 0x000A, /* LINE FEED */
15753 0x000D, /* CARRIAGE RETURN */
15754 0x001C, /* FILE SEPARATOR */
15755 0x001D, /* GROUP SEPARATOR */
15756 0x001E, /* RECORD SEPARATOR */
15757 0x0085, /* NEXT LINE */
15758 0x2028, /* LINE SEPARATOR */
15759 0x2029, /* PARAGRAPH SEPARATOR */
15760 };
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015761 bloom_linebreak = make_bloom_mask(
15762 PyUnicode_2BYTE_KIND, linebreak,
15763 Py_ARRAY_LENGTH(linebreak));
Victor Stinner442ad742021-04-02 15:28:13 +020015764 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015765
Victor Stinner442ad742021-04-02 15:28:13 +020015766 return _PyStatus_OK();
15767}
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015768
Victor Stinner442ad742021-04-02 15:28:13 +020015769
15770PyStatus
15771_PyUnicode_InitTypes(void)
15772{
15773 if (PyType_Ready(&PyUnicode_Type) < 0) {
15774 return _PyStatus_ERR("Can't initialize unicode type");
15775 }
15776 if (PyType_Ready(&EncodingMapType) < 0) {
15777 return _PyStatus_ERR("Can't initialize encoding map type");
15778 }
15779 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15780 return _PyStatus_ERR("Can't initialize field name iterator type");
15781 }
15782 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15783 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015784 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015785 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015786}
15787
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015788
Walter Dörwald16807132007-05-25 13:52:07 +000015789void
15790PyUnicode_InternInPlace(PyObject **p)
15791{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015792 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015793#ifdef Py_DEBUG
15794 assert(s != NULL);
15795 assert(_PyUnicode_CHECK(s));
15796#else
Victor Stinner607b1022020-05-05 18:50:30 +020015797 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015798 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015799 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015800#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015801
Benjamin Peterson14339b62009-01-31 16:36:08 +000015802 /* If it's a subclass, we don't really know what putting
15803 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015804 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015805 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015806 }
15807
15808 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015809 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015810 }
15811
Victor Stinner666ecfb2020-07-02 01:19:57 +020015812 if (PyUnicode_READY(s) == -1) {
15813 PyErr_Clear();
15814 return;
15815 }
15816
Victor Stinnerea251802020-12-26 02:58:33 +010015817 struct _Py_unicode_state *state = get_unicode_state();
15818 if (state->interned == NULL) {
15819 state->interned = PyDict_New();
15820 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015821 PyErr_Clear(); /* Don't leave an exception */
15822 return;
15823 }
15824 }
Victor Stinner607b1022020-05-05 18:50:30 +020015825
Victor Stinnerea251802020-12-26 02:58:33 +010015826 PyObject *t = PyDict_SetDefault(state->interned, s, s);
Berker Peksagced8d4c2016-07-25 04:40:39 +030015827 if (t == NULL) {
15828 PyErr_Clear();
15829 return;
15830 }
Victor Stinner607b1022020-05-05 18:50:30 +020015831
Berker Peksagced8d4c2016-07-25 04:40:39 +030015832 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015833 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015834 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015835 return;
15836 }
Victor Stinner607b1022020-05-05 18:50:30 +020015837
Victor Stinner3549ca32020-07-03 16:59:12 +020015838 /* The two references in interned dict (key and value) are not counted by
15839 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15840 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015841 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015842 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015843}
15844
Victor Stinnerea251802020-12-26 02:58:33 +010015845
Walter Dörwald16807132007-05-25 13:52:07 +000015846void
15847PyUnicode_InternImmortal(PyObject **p)
15848{
Victor Stinner583ee5a2020-10-02 14:49:00 +020015849 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15850 "PyUnicode_InternImmortal() is deprecated; "
15851 "use PyUnicode_InternInPlace() instead", 1) < 0)
15852 {
15853 // The function has no return value, the exception cannot
15854 // be reported to the caller, so just log it.
15855 PyErr_WriteUnraisable(NULL);
15856 }
15857
Benjamin Peterson14339b62009-01-31 16:36:08 +000015858 PyUnicode_InternInPlace(p);
15859 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015860 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015861 Py_INCREF(*p);
15862 }
Walter Dörwald16807132007-05-25 13:52:07 +000015863}
15864
15865PyObject *
15866PyUnicode_InternFromString(const char *cp)
15867{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015868 PyObject *s = PyUnicode_FromString(cp);
15869 if (s == NULL)
15870 return NULL;
15871 PyUnicode_InternInPlace(&s);
15872 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015873}
15874
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015875
Victor Stinner666ecfb2020-07-02 01:19:57 +020015876void
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015877_PyUnicode_ClearInterned(PyInterpreterState *interp)
Walter Dörwald16807132007-05-25 13:52:07 +000015878{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015879 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinnerea251802020-12-26 02:58:33 +010015880 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015881 return;
15882 }
Victor Stinnerea251802020-12-26 02:58:33 +010015883 assert(PyDict_CheckExact(state->interned));
Victor Stinner666ecfb2020-07-02 01:19:57 +020015884
15885 /* Interned unicode strings are not forcibly deallocated; rather, we give
15886 them their stolen references back, and then clear and DECREF the
15887 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015888
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015889#ifdef INTERNED_STATS
Victor Stinnerea251802020-12-26 02:58:33 +010015890 fprintf(stderr, "releasing %zd interned strings\n",
15891 PyDict_GET_SIZE(state->interned));
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015892
15893 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015894#endif
Victor Stinnerea251802020-12-26 02:58:33 +010015895 Py_ssize_t pos = 0;
15896 PyObject *s, *ignored_value;
15897 while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) {
Victor Stinner666ecfb2020-07-02 01:19:57 +020015898 assert(PyUnicode_IS_READY(s));
15899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015900 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015901 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015902 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015903#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015904 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015905#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015906 break;
15907 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015908 // Restore the two references (key and value) ignored
15909 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015910 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015911#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015912 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015913#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015914 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015915 case SSTATE_NOT_INTERNED:
15916 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015917 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015918 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015920 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015921 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015922#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015923 fprintf(stderr,
15924 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15925 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015926#endif
Victor Stinner666ecfb2020-07-02 01:19:57 +020015927
Victor Stinnerea251802020-12-26 02:58:33 +010015928 PyDict_Clear(state->interned);
15929 Py_CLEAR(state->interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015930}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015931
15932
15933/********************* Unicode Iterator **************************/
15934
15935typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015936 PyObject_HEAD
15937 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015938 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015939} unicodeiterobject;
15940
15941static void
15942unicodeiter_dealloc(unicodeiterobject *it)
15943{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015944 _PyObject_GC_UNTRACK(it);
15945 Py_XDECREF(it->it_seq);
15946 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015947}
15948
15949static int
15950unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15951{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015952 Py_VISIT(it->it_seq);
15953 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015954}
15955
15956static PyObject *
15957unicodeiter_next(unicodeiterobject *it)
15958{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015959 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015960
Benjamin Peterson14339b62009-01-31 16:36:08 +000015961 assert(it != NULL);
15962 seq = it->it_seq;
15963 if (seq == NULL)
15964 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015965 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015967 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15968 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015969 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015970 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15971 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015972 if (item != NULL)
15973 ++it->it_index;
15974 return item;
15975 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015976
Benjamin Peterson14339b62009-01-31 16:36:08 +000015977 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015978 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015979 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015980}
15981
15982static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015983unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015984{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015985 Py_ssize_t len = 0;
15986 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015987 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015988 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015989}
15990
15991PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15992
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015993static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015994unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015995{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015996 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015997 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015998 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015999 it->it_seq, it->it_index);
16000 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020016001 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016002 if (u == NULL)
16003 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020016004 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016005 }
16006}
16007
16008PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16009
16010static PyObject *
16011unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
16012{
16013 Py_ssize_t index = PyLong_AsSsize_t(state);
16014 if (index == -1 && PyErr_Occurred())
16015 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000016016 if (it->it_seq != NULL) {
16017 if (index < 0)
16018 index = 0;
16019 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
16020 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
16021 it->it_index = index;
16022 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016023 Py_RETURN_NONE;
16024}
16025
16026PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16027
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016028static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000016029 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000016030 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016031 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
16032 reduce_doc},
16033 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
16034 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000016035 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016036};
16037
16038PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000016039 PyVarObject_HEAD_INIT(&PyType_Type, 0)
16040 "str_iterator", /* tp_name */
16041 sizeof(unicodeiterobject), /* tp_basicsize */
16042 0, /* tp_itemsize */
16043 /* methods */
16044 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020016045 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000016046 0, /* tp_getattr */
16047 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020016048 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000016049 0, /* tp_repr */
16050 0, /* tp_as_number */
16051 0, /* tp_as_sequence */
16052 0, /* tp_as_mapping */
16053 0, /* tp_hash */
16054 0, /* tp_call */
16055 0, /* tp_str */
16056 PyObject_GenericGetAttr, /* tp_getattro */
16057 0, /* tp_setattro */
16058 0, /* tp_as_buffer */
16059 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16060 0, /* tp_doc */
16061 (traverseproc)unicodeiter_traverse, /* tp_traverse */
16062 0, /* tp_clear */
16063 0, /* tp_richcompare */
16064 0, /* tp_weaklistoffset */
16065 PyObject_SelfIter, /* tp_iter */
16066 (iternextfunc)unicodeiter_next, /* tp_iternext */
16067 unicodeiter_methods, /* tp_methods */
16068 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016069};
16070
16071static PyObject *
16072unicode_iter(PyObject *seq)
16073{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016074 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016075
Benjamin Peterson14339b62009-01-31 16:36:08 +000016076 if (!PyUnicode_Check(seq)) {
16077 PyErr_BadInternalCall();
16078 return NULL;
16079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020016080 if (PyUnicode_READY(seq) == -1)
16081 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016082 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16083 if (it == NULL)
16084 return NULL;
16085 it->it_index = 0;
16086 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016087 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016088 _PyObject_GC_TRACK(it);
16089 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016090}
16091
Victor Stinner709d23d2019-05-02 14:56:30 -040016092static int
16093encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016094{
Victor Stinner709d23d2019-05-02 14:56:30 -040016095 int res;
16096 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16097 if (res == -2) {
16098 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16099 return -1;
16100 }
16101 if (res < 0) {
16102 PyErr_NoMemory();
16103 return -1;
16104 }
16105 return 0;
16106}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016107
Victor Stinner709d23d2019-05-02 14:56:30 -040016108
16109static int
16110config_get_codec_name(wchar_t **config_encoding)
16111{
16112 char *encoding;
16113 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16114 return -1;
16115 }
16116
16117 PyObject *name_obj = NULL;
16118 PyObject *codec = _PyCodec_Lookup(encoding);
16119 PyMem_RawFree(encoding);
16120
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016121 if (!codec)
16122 goto error;
16123
16124 name_obj = PyObject_GetAttrString(codec, "name");
16125 Py_CLEAR(codec);
16126 if (!name_obj) {
16127 goto error;
16128 }
16129
Victor Stinner709d23d2019-05-02 14:56:30 -040016130 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16131 Py_DECREF(name_obj);
16132 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016133 goto error;
16134 }
16135
Victor Stinner709d23d2019-05-02 14:56:30 -040016136 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16137 if (raw_wname == NULL) {
16138 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016139 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016140 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016141 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016142
16143 PyMem_RawFree(*config_encoding);
16144 *config_encoding = raw_wname;
16145
16146 PyMem_Free(wname);
16147 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016148
16149error:
16150 Py_XDECREF(codec);
16151 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016152 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016153}
16154
16155
Victor Stinner331a6a52019-05-27 16:39:22 +020016156static PyStatus
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016157init_stdio_encoding(PyInterpreterState *interp)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016158{
Victor Stinner709d23d2019-05-02 14:56:30 -040016159 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016160 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016161 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016162 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016163 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016164 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016165 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016166}
16167
16168
Victor Stinner709d23d2019-05-02 14:56:30 -040016169static int
16170init_fs_codec(PyInterpreterState *interp)
16171{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016172 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016173
16174 _Py_error_handler error_handler;
16175 error_handler = get_error_handler_wide(config->filesystem_errors);
16176 if (error_handler == _Py_ERROR_UNKNOWN) {
Christian Claussdcfbe4f2021-10-07 16:31:33 +020016177 PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
Victor Stinner709d23d2019-05-02 14:56:30 -040016178 return -1;
16179 }
16180
16181 char *encoding, *errors;
16182 if (encode_wstr_utf8(config->filesystem_encoding,
16183 &encoding,
16184 "filesystem_encoding") < 0) {
16185 return -1;
16186 }
16187
16188 if (encode_wstr_utf8(config->filesystem_errors,
16189 &errors,
16190 "filesystem_errors") < 0) {
16191 PyMem_RawFree(encoding);
16192 return -1;
16193 }
16194
Victor Stinner3d17c042020-05-14 01:48:38 +020016195 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16196 PyMem_RawFree(fs_codec->encoding);
16197 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016198 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016199 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16200 PyMem_RawFree(fs_codec->errors);
16201 fs_codec->errors = errors;
16202 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016203
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016204#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016205 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016206#endif
16207
Victor Stinner709d23d2019-05-02 14:56:30 -040016208 /* At this point, PyUnicode_EncodeFSDefault() and
16209 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16210 the C implementation of the filesystem encoding. */
16211
16212 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16213 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016214 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16215 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016216 PyErr_NoMemory();
16217 return -1;
16218 }
16219 return 0;
16220}
16221
16222
Victor Stinner331a6a52019-05-27 16:39:22 +020016223static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016224init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016225{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016226 PyInterpreterState *interp = tstate->interp;
16227
Victor Stinner709d23d2019-05-02 14:56:30 -040016228 /* Update the filesystem encoding to the normalized Python codec name.
16229 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16230 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016231 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016232 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016233 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016234 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016235 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016236 }
16237
Victor Stinner709d23d2019-05-02 14:56:30 -040016238 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016239 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016240 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016241 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016242}
16243
16244
Victor Stinner331a6a52019-05-27 16:39:22 +020016245PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016246_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016247{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016248 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016249 if (_PyStatus_EXCEPTION(status)) {
16250 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016251 }
16252
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016253 return init_stdio_encoding(tstate->interp);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016254}
16255
16256
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016257static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016258_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016259{
Victor Stinner3d17c042020-05-14 01:48:38 +020016260 PyMem_RawFree(fs_codec->encoding);
16261 fs_codec->encoding = NULL;
16262 fs_codec->utf8 = 0;
16263 PyMem_RawFree(fs_codec->errors);
16264 fs_codec->errors = NULL;
16265 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016266}
16267
16268
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Return the result of init_fs_codec(). If one of the new strings cannot
   be allocated, the configuration is left untouched, PyErr_NoMemory() is
   called and -1 is returned. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings before touching the config, so a
       failure cannot leave it half-updated. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* The config takes ownership of the new strings. */
    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16294
16295
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016296void
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016297_PyUnicode_Fini(PyInterpreterState *interp)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016298{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016299 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016300
Victor Stinnerea251802020-12-26 02:58:33 +010016301 // _PyUnicode_ClearInterned() must be called before
16302 assert(state->interned == NULL);
16303
16304 _PyUnicode_FiniEncodings(&state->fs_codec);
16305
Victor Stinnerf4507232020-12-26 20:26:08 +010016306 unicode_clear_identifiers(state);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016307
Victor Stinner2f9ada92020-06-24 02:22:21 +020016308 for (Py_ssize_t i = 0; i < 256; i++) {
16309 Py_CLEAR(state->latin1[i]);
16310 }
Victor Stinnerea251802020-12-26 02:58:33 +010016311 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016312}
16313
16314
Georg Brandl66c221e2010-10-14 07:04:07 +000016315/* A _string module, to export formatter_parser and formatter_field_name_split
16316 to the string.Formatter class implemented in Python. */
16317
16318static PyMethodDef _string_methods[] = {
16319 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16320 METH_O, PyDoc_STR("split the argument as a field name")},
16321 {"formatter_parser", (PyCFunction) formatter_parser,
16322 METH_O, PyDoc_STR("parse the argument as a format string")},
16323 {NULL, NULL}
16324};
16325
16326static struct PyModuleDef _string_module = {
16327 PyModuleDef_HEAD_INIT,
Victor Stinnerbb083d32020-09-08 15:33:08 +020016328 .m_name = "_string",
16329 .m_doc = PyDoc_STR("string helper module"),
16330 .m_size = 0,
16331 .m_methods = _string_methods,
Georg Brandl66c221e2010-10-14 07:04:07 +000016332};
16333
16334PyMODINIT_FUNC
16335PyInit__string(void)
16336{
Victor Stinnerbb083d32020-09-08 15:33:08 +020016337 return PyModuleDef_Init(&_string_module);
Georg Brandl66c221e2010-10-14 07:04:07 +000016338}
16339
16340
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016341#ifdef __cplusplus
16342}
16343#endif