blob: 342e7eec0616431168eba602510c59da36955c0c [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinnerba3d67c2020-12-26 00:41:46 +010044#include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
Victor Stinner47e1afd2020-10-26 16:43:47 +010045#include "pycore_bytes_methods.h" // _Py_bytes_lower()
Serhiy Storchaka2ad93822020-12-03 12:46:16 +020046#include "pycore_format.h" // F_LJUST
Victor Stinner47e1afd2020-10-26 16:43:47 +010047#include "pycore_initconfig.h" // _PyStatus_OK()
48#include "pycore_interp.h" // PyInterpreterState.fs_codec
49#include "pycore_object.h" // _PyObject_GC_TRACK()
50#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
51#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
52#include "pycore_pystate.h" // _PyInterpreterState_GET()
53#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
54#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000056#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000057#include <windows.h>
58#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059
Jakub Kulík9032cf52021-04-30 15:21:42 +020060#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
61#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
62#endif
63
Victor Stinner666ecfb2020-07-02 01:19:57 +020064/* Uncomment to display statistics on interned strings at exit
65 in _PyUnicode_ClearInterned(). */
Victor Stinnerfecc4f22019-03-19 14:20:29 +010066/* #define INTERNED_STATS 1 */
67
68
Larry Hastings61272b72014-01-07 12:41:53 -080069/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090070class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080071[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090072/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
73
74/*[python input]
75class Py_UCS4_converter(CConverter):
76 type = 'Py_UCS4'
77 converter = 'convert_uc'
78
79 def converter_init(self):
80 if self.default is not unspecified:
81 self.c_default = ascii(self.default)
82 if len(self.c_default) > 4 or self.c_default[0] != "'":
83 self.c_default = hex(ord(self.default))
84
85[python start generated code]*/
86/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080087
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000095
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000096
97#ifdef __cplusplus
98extern "C" {
99#endif
100
// Largest valid Unicode code point: U+10FFFF (1,114,111).
// fileutils.c carries the same limit; the two values must stay equal.
#define MAX_UNICODE 0x10ffff

// Argument sanity check used by the helper macros of this file.
// Debug builds run the structural consistency check (without the O(n)
// content scan); other builds only test the object type.
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200110
/* Field accessors for the string object layouts.
 *
 * Macros whose name starts with an underscore read the struct member
 * directly, without any check, and can be used as lvalues.  The checked
 * variants assert _PyUnicode_CHECK() first.
 */

/* Cached UTF-8 buffer pointer stored in the compact header. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located right after the PyASCIIObject header; otherwise it is the
   stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Stored length of the cached UTF-8 buffer. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string.  A compact ASCII string reuses its
   character length; otherwise the stored utf8_length is used. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation pointer. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* Length of the wchar_t representation.  `op` is parenthesized before the
   cast so that an expression argument (e.g. `p + 1`) is cast as a whole
   instead of only its first operand. */
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         ((PyCompactUnicodeObject*)(op))->wstr_length)

/* Stored wstr_length member. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Stored character length. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* State bit-field structure. */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Stored hash member. */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Kind of the string, after checking the object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Character length of the string, after checking the object. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Data pointer member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200152
/* Local replacement for PyUnicode_READY(): checks the argument with
   _PyUnicode_CHECK(), then evaluates to 0 if the string is already ready and
   to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
          _PyUnicode_Ready(op) :                        \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200159
/* True if the UTF-8 pointer of `op` is the character data pointer itself.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the character data pointer itself.
   The comparison uses the macro argument `op`; it does not depend on a
   variable named `unicode` existing at the expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
167
/* Non-zero when the string has its own UTF-8 block, i.e. it is not compact
   ASCII, the utf8 pointer is set, and that pointer is not the character
   data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the string has its own wstr block, i.e. the wstr pointer is
   set and the string is either not ready or its wstr pointer is not the
   character data pointer.  PyUnicode_DATA() is only evaluated for ready
   strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && !(PyUnicode_IS_READY(op)                        \
          && _PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
181
/* Copy a range of characters while converting the element type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *".  to should be
   of type "to_type *" and points to the buffer receiving the converted
   characters.

   Characters are first copied in groups of four (the count rounded down to
   a multiple of 4), then the remaining ones are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                            \
        to_type *_to = (to_type *)(to);                             \
        const from_type *_iter = (const from_type *)(begin);        \
        const from_type *_end = (const from_type *)(end);           \
        Py_ssize_t _n = _end - _iter;                               \
        const from_type *_unrolled_end =                            \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                     \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {       \
            _to[0] = (to_type) _iter[0];                            \
            _to[1] = (to_type) _iter[1];                            \
            _to[2] = (to_type) _iter[2];                            \
            _to[3] = (to_type) _iter[3];                            \
        }                                                           \
        for (; _iter < _end; _iter++, _to++) {                      \
            *_to = (to_type) *_iter;                                \
        }                                                           \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200205
/* Platform-specific overallocation factor:
     2 -> overallocate by 50%, the best factor on Windows;
     4 -> overallocate by 25%, the best factor on Linux (used for every
          non-Windows build). */
#ifndef MS_WINDOWS
#  define OVERALLOCATE_FACTOR 4
#else
#  define OVERALLOCATE_FACTOR 2
#endif
213
Walter Dörwald16807132007-05-25 13:52:07 +0000214
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200215static struct _Py_unicode_state*
216get_unicode_state(void)
217{
218 PyInterpreterState *interp = _PyInterpreterState_GET();
219 return &interp->unicode;
220}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200223// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static inline PyObject* unicode_get_empty(void)
225{
226 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200227 // unicode_get_empty() must not be called before _PyUnicode_Init()
228 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200229 assert(state->empty_string != NULL);
230 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200231}
232
Victor Stinner91698d82020-06-25 14:07:40 +0200233
234// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200235static inline PyObject* unicode_new_empty(void)
236{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200237 PyObject *empty = unicode_get_empty();
238 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200239 return empty;
240}
241
/* Return a strong reference to the empty string singleton from the
   enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY()      \
    do {                                \
        return unicode_new_empty();     \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000246
Victor Stinner59423e32018-11-26 13:40:01 +0100247static inline void
248unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
249 Py_ssize_t start, Py_ssize_t length)
250{
251 assert(0 <= start);
252 assert(kind != PyUnicode_WCHAR_KIND);
253 switch (kind) {
254 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS1 ch = (unsigned char)value;
257 Py_UCS1 *to = (Py_UCS1 *)data + start;
258 memset(to, ch, length);
259 break;
260 }
261 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100262 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100263 Py_UCS2 ch = (Py_UCS2)value;
264 Py_UCS2 *to = (Py_UCS2 *)data + start;
265 const Py_UCS2 *end = to + length;
266 for (; to < end; ++to) *to = ch;
267 break;
268 }
269 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100270 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100271 Py_UCS4 ch = value;
272 Py_UCS4 * to = (Py_UCS4 *)data + start;
273 const Py_UCS4 *end = to + length;
274 for (; to < end; ++to) *to = ch;
275 break;
276 }
277 default: Py_UNREACHABLE();
278 }
279}
280
281
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200282/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700283static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200284_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900285static inline void
286_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400287static PyObject *
288unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
289 const char *errors);
290static PyObject *
291unicode_decode_utf8(const char *s, Py_ssize_t size,
292 _Py_error_handler error_handler, const char *errors,
293 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200294
/* Fast detection of the most frequent whitespace characters.

   128-entry table indexed by ASCII code; an entry is 1 for:
     0x09 CHARACTER TABULATION    0x1C FILE SEPARATOR
     0x0A LINE FEED               0x1D GROUP SEPARATOR
     0x0B LINE TABULATION         0x1E RECORD SEPARATOR
     0x0C FORM FEED               0x1F UNIT SEPARATOR
     0x0D CARRIAGE RETURN         0x20 SPACE
   and 0 for every other code. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
325
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200326/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200327static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200328static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100329static int unicode_modifiable(PyObject *unicode);
330
Victor Stinnerfe226c02011-10-03 03:52:20 +0200331
Alexander Belopolsky40018472011-02-26 01:02:56 +0000332static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100333_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200334static PyObject *
335_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
336static PyObject *
337_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
338
339static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000340unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000341 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100342 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000343 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
344
Alexander Belopolsky40018472011-02-26 01:02:56 +0000345static void
346raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300347 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100348 PyObject *unicode,
349 Py_ssize_t startpos, Py_ssize_t endpos,
350 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000351
/* Fast detection of ASCII line break characters.

   128-entry table indexed by ASCII code; an entry is 1 for:
     0x0A LINE FEED               0x1C FILE SEPARATOR
     0x0B LINE TABULATION         0x1D GROUP SEPARATOR
     0x0C FORM FEED               0x1E RECORD SEPARATOR
     0x0D CARRIAGE RETURN
   and 0 for every other code. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
379
INADA Naoki3ae20562017-01-16 20:41:20 +0900380static int convert_uc(PyObject *obj, void *addr);
381
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300382#include "clinic/unicodeobject.c.h"
383
Victor Stinner3d4226a2018-08-29 22:21:32 +0200384_Py_error_handler
385_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200386{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200387 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200388 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200391 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200394 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200397 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
399 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200400 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200403 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
Victor Stinner50149202015-09-22 00:26:54 +0200408 return _Py_ERROR_OTHER;
409}
410
Victor Stinner709d23d2019-05-02 14:56:30 -0400411
412static _Py_error_handler
413get_error_handler_wide(const wchar_t *errors)
414{
415 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
416 return _Py_ERROR_STRICT;
417 }
418 if (wcscmp(errors, L"surrogateescape") == 0) {
419 return _Py_ERROR_SURROGATEESCAPE;
420 }
421 if (wcscmp(errors, L"replace") == 0) {
422 return _Py_ERROR_REPLACE;
423 }
424 if (wcscmp(errors, L"ignore") == 0) {
425 return _Py_ERROR_IGNORE;
426 }
427 if (wcscmp(errors, L"backslashreplace") == 0) {
428 return _Py_ERROR_BACKSLASHREPLACE;
429 }
430 if (wcscmp(errors, L"surrogatepass") == 0) {
431 return _Py_ERROR_SURROGATEPASS;
432 }
433 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
434 return _Py_ERROR_XMLCHARREFREPLACE;
435 }
436 return _Py_ERROR_OTHER;
437}
438
439
Victor Stinner22eb6892019-06-26 00:51:05 +0200440static inline int
441unicode_check_encoding_errors(const char *encoding, const char *errors)
442{
443 if (encoding == NULL && errors == NULL) {
444 return 0;
445 }
446
Victor Stinner81a7be32020-04-14 15:14:01 +0200447 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200448#ifndef Py_DEBUG
449 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200450 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200451 return 0;
452 }
453#else
454 /* Always check in debug mode */
455#endif
456
457 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
458 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200459 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200460 return 0;
461 }
462
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200463 /* Disable checks during Python finalization. For example, it allows to
464 call _PyObject_Dump() during finalization for debugging purpose. */
465 if (interp->finalizing) {
466 return 0;
467 }
468
Victor Stinner22eb6892019-06-26 00:51:05 +0200469 if (encoding != NULL) {
470 PyObject *handler = _PyCodec_Lookup(encoding);
471 if (handler == NULL) {
472 return -1;
473 }
474 Py_DECREF(handler);
475 }
476
477 if (errors != NULL) {
478 PyObject *handler = PyCodec_LookupError(errors);
479 if (handler == NULL) {
480 return -1;
481 }
482 Py_DECREF(handler);
483 }
484 return 0;
485}
486
487
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200488int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100489_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200490{
Victor Stinner68762572019-10-07 18:42:01 +0200491#define CHECK(expr) \
492 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
493
Victor Stinner910337b2011-10-03 03:20:16 +0200494 PyASCIIObject *ascii;
495 unsigned int kind;
496
Victor Stinner68762572019-10-07 18:42:01 +0200497 assert(op != NULL);
498 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200499
500 ascii = (PyASCIIObject *)op;
501 kind = ascii->state.kind;
502
Victor Stinnera3b334d2011-10-03 13:53:37 +0200503 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200504 CHECK(kind == PyUnicode_1BYTE_KIND);
505 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200506 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200507 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200508 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200509 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200510
Victor Stinnera41463c2011-10-04 01:05:08 +0200511 if (ascii->state.compact == 1) {
512 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200513 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200514 || kind == PyUnicode_2BYTE_KIND
515 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200516 CHECK(ascii->state.ascii == 0);
517 CHECK(ascii->state.ready == 1);
518 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100519 }
520 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200521 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
522
523 data = unicode->data.any;
524 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(ascii->length == 0);
526 CHECK(ascii->hash == -1);
527 CHECK(ascii->state.compact == 0);
528 CHECK(ascii->state.ascii == 0);
529 CHECK(ascii->state.ready == 0);
530 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
531 CHECK(ascii->wstr != NULL);
532 CHECK(data == NULL);
533 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200534 }
535 else {
Victor Stinner68762572019-10-07 18:42:01 +0200536 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200537 || kind == PyUnicode_2BYTE_KIND
538 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200539 CHECK(ascii->state.compact == 0);
540 CHECK(ascii->state.ready == 1);
541 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200543 CHECK(compact->utf8 == data);
544 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200545 }
546 else
Victor Stinner68762572019-10-07 18:42:01 +0200547 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200548 }
549 }
550 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200551 if (
552#if SIZEOF_WCHAR_T == 2
553 kind == PyUnicode_2BYTE_KIND
554#else
555 kind == PyUnicode_4BYTE_KIND
556#endif
557 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200558 {
Victor Stinner68762572019-10-07 18:42:01 +0200559 CHECK(ascii->wstr == data);
560 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200561 } else
Victor Stinner68762572019-10-07 18:42:01 +0200562 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200563 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200564
565 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200566 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200567 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200569 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200570
571 /* check that the best kind is used: O(n) operation */
572 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200573 Py_ssize_t i;
574 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300575 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200576 Py_UCS4 ch;
577
578 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200579 for (i=0; i < ascii->length; i++)
580 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200581 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200582 if (ch > maxchar)
583 maxchar = ch;
584 }
585 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100586 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200587 CHECK(maxchar >= 128);
588 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100589 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200590 else
Victor Stinner68762572019-10-07 18:42:01 +0200591 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200592 }
Victor Stinner77faf692011-11-20 18:56:05 +0100593 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200594 CHECK(maxchar >= 0x100);
595 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100596 }
597 else {
Victor Stinner68762572019-10-07 18:42:01 +0200598 CHECK(maxchar >= 0x10000);
599 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100600 }
Victor Stinner68762572019-10-07 18:42:01 +0200601 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400603 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200604
605#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400606}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200607
Victor Stinner910337b2011-10-03 03:20:16 +0200608
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100609static PyObject*
610unicode_result_wchar(PyObject *unicode)
611{
612#ifndef Py_DEBUG
613 Py_ssize_t len;
614
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100615 len = _PyUnicode_WSTR_LENGTH(unicode);
616 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100617 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200618 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100619 }
620
621 if (len == 1) {
622 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100623 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100624 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200625 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100626 }
627 }
628
629 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200630 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 return NULL;
632 }
633#else
Victor Stinneraa771272012-10-04 02:32:58 +0200634 assert(Py_REFCNT(unicode) == 1);
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 /* don't make the result ready in debug mode to ensure that the caller
637 makes the string ready before using it */
638 assert(_PyUnicode_CheckConsistency(unicode, 1));
639#endif
640 return unicode;
641}
642
643static PyObject*
644unicode_result_ready(PyObject *unicode)
645{
646 Py_ssize_t length;
647
648 length = PyUnicode_GET_LENGTH(unicode);
649 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200650 PyObject *empty = unicode_get_empty();
651 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100652 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200653 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100654 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200655 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100656 }
657
658 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200659 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200660 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakac43317d2021-06-12 20:44:32 +0300661 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200662 Py_UCS1 ch = data[0];
663 struct _Py_unicode_state *state = get_unicode_state();
664 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100665 if (latin1_char != NULL) {
666 if (unicode != latin1_char) {
667 Py_INCREF(latin1_char);
668 Py_DECREF(unicode);
669 }
670 return latin1_char;
671 }
672 else {
673 assert(_PyUnicode_CheckConsistency(unicode, 1));
674 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200675 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100676 return unicode;
677 }
678 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200679 else {
680 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
681 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100682 }
683
684 assert(_PyUnicode_CheckConsistency(unicode, 1));
685 return unicode;
686}
687
688static PyObject*
689unicode_result(PyObject *unicode)
690{
691 assert(_PyUnicode_CHECK(unicode));
692 if (PyUnicode_IS_READY(unicode))
693 return unicode_result_ready(unicode);
694 else
695 return unicode_result_wchar(unicode);
696}
697
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698static PyObject*
699unicode_result_unchanged(PyObject *unicode)
700{
701 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500702 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100703 return NULL;
704 Py_INCREF(unicode);
705 return unicode;
706 }
707 else
708 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100709 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710}
711
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200712/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
713 ASCII, Latin1, UTF-8, etc. */
714static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200715backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200716 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
717{
Victor Stinnerad771582015-10-09 12:38:53 +0200718 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200719 Py_UCS4 ch;
720 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300721 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200722
723 assert(PyUnicode_IS_READY(unicode));
724 kind = PyUnicode_KIND(unicode);
725 data = PyUnicode_DATA(unicode);
726
727 size = 0;
728 /* determine replacement size */
729 for (i = collstart; i < collend; ++i) {
730 Py_ssize_t incr;
731
732 ch = PyUnicode_READ(kind, data, i);
733 if (ch < 0x100)
734 incr = 2+2;
735 else if (ch < 0x10000)
736 incr = 2+4;
737 else {
738 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200739 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200740 }
741 if (size > PY_SSIZE_T_MAX - incr) {
742 PyErr_SetString(PyExc_OverflowError,
743 "encoded result is too long for a Python string");
744 return NULL;
745 }
746 size += incr;
747 }
748
Victor Stinnerad771582015-10-09 12:38:53 +0200749 str = _PyBytesWriter_Prepare(writer, str, size);
750 if (str == NULL)
751 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752
753 /* generate replacement */
754 for (i = collstart; i < collend; ++i) {
755 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200756 *str++ = '\\';
757 if (ch >= 0x00010000) {
758 *str++ = 'U';
759 *str++ = Py_hexdigits[(ch>>28)&0xf];
760 *str++ = Py_hexdigits[(ch>>24)&0xf];
761 *str++ = Py_hexdigits[(ch>>20)&0xf];
762 *str++ = Py_hexdigits[(ch>>16)&0xf];
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200765 }
Victor Stinner797485e2015-10-09 03:17:30 +0200766 else if (ch >= 0x100) {
767 *str++ = 'u';
768 *str++ = Py_hexdigits[(ch>>12)&0xf];
769 *str++ = Py_hexdigits[(ch>>8)&0xf];
770 }
771 else
772 *str++ = 'x';
773 *str++ = Py_hexdigits[(ch>>4)&0xf];
774 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200775 }
776 return str;
777}
778
779/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
780 ASCII, Latin1, UTF-8, etc. */
781static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200782xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200783 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
784{
Victor Stinnerad771582015-10-09 12:38:53 +0200785 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200786 Py_UCS4 ch;
787 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300788 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200789
790 assert(PyUnicode_IS_READY(unicode));
791 kind = PyUnicode_KIND(unicode);
792 data = PyUnicode_DATA(unicode);
793
794 size = 0;
795 /* determine replacement size */
796 for (i = collstart; i < collend; ++i) {
797 Py_ssize_t incr;
798
799 ch = PyUnicode_READ(kind, data, i);
800 if (ch < 10)
801 incr = 2+1+1;
802 else if (ch < 100)
803 incr = 2+2+1;
804 else if (ch < 1000)
805 incr = 2+3+1;
806 else if (ch < 10000)
807 incr = 2+4+1;
808 else if (ch < 100000)
809 incr = 2+5+1;
810 else if (ch < 1000000)
811 incr = 2+6+1;
812 else {
813 assert(ch <= MAX_UNICODE);
814 incr = 2+7+1;
815 }
816 if (size > PY_SSIZE_T_MAX - incr) {
817 PyErr_SetString(PyExc_OverflowError,
818 "encoded result is too long for a Python string");
819 return NULL;
820 }
821 size += incr;
822 }
823
Victor Stinnerad771582015-10-09 12:38:53 +0200824 str = _PyBytesWriter_Prepare(writer, str, size);
825 if (str == NULL)
826 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200827
828 /* generate replacement */
829 for (i = collstart; i < collend; ++i) {
Christian Heimes07f2ade2020-11-18 16:38:53 +0100830 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
831 if (size < 0) {
832 return NULL;
833 }
834 str += size;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200835 }
836 return str;
837}
838
Thomas Wouters477c8d52006-05-27 19:21:47 +0000839/* --- Bloom Filters ----------------------------------------------------- */
840
841/* stuff to implement simple "bloom filters" for Unicode characters.
842 to keep things simple, we use a single bitmask, using the least 5
843 bits from each unicode characters as the bit index. */
844
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200845/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000846
Antoine Pitrouf068f942010-01-13 14:19:12 +0000847#if LONG_BIT >= 128
848#define BLOOM_WIDTH 128
849#elif LONG_BIT >= 64
850#define BLOOM_WIDTH 64
851#elif LONG_BIT >= 32
852#define BLOOM_WIDTH 32
853#else
854#error "LONG_BIT is smaller than 32"
855#endif
856
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857#define BLOOM_MASK unsigned long
858
Serhiy Storchaka05997252013-01-26 12:14:02 +0200859static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860
Antoine Pitrouf068f942010-01-13 14:19:12 +0000861#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000862
Benjamin Peterson29060642009-01-31 22:14:21 +0000863#define BLOOM_LINEBREAK(ch) \
864 ((ch) < 128U ? ascii_linebreak[(ch)] : \
865 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000866
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700867static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300868make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000869{
Victor Stinnera85af502013-04-09 21:53:54 +0200870#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
871 do { \
872 TYPE *data = (TYPE *)PTR; \
873 TYPE *end = data + LEN; \
874 Py_UCS4 ch; \
875 for (; data != end; data++) { \
876 ch = *data; \
877 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
878 } \
879 break; \
880 } while (0)
881
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882 /* calculate simple bloom-style bitmask for a given unicode string */
883
Antoine Pitrouf068f942010-01-13 14:19:12 +0000884 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000885
886 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200887 switch (kind) {
888 case PyUnicode_1BYTE_KIND:
889 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
890 break;
891 case PyUnicode_2BYTE_KIND:
892 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
893 break;
894 case PyUnicode_4BYTE_KIND:
895 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
896 break;
897 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700898 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200899 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000900 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200901
902#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000903}
904
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300905static int
906ensure_unicode(PyObject *obj)
907{
908 if (!PyUnicode_Check(obj)) {
909 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200910 "must be str, not %.100s",
911 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300912 return -1;
913 }
914 return PyUnicode_READY(obj);
915}
916
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917/* Compilation of templated routines */
918
Victor Stinner90ed8a62020-06-24 00:34:07 +0200919#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200920
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200921#include "stringlib/asciilib.h"
922#include "stringlib/fastsearch.h"
923#include "stringlib/partition.h"
924#include "stringlib/split.h"
925#include "stringlib/count.h"
926#include "stringlib/find.h"
927#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200928#include "stringlib/undef.h"
929
930#include "stringlib/ucs1lib.h"
931#include "stringlib/fastsearch.h"
932#include "stringlib/partition.h"
933#include "stringlib/split.h"
934#include "stringlib/count.h"
935#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300936#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200937#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200938#include "stringlib/undef.h"
939
940#include "stringlib/ucs2lib.h"
941#include "stringlib/fastsearch.h"
942#include "stringlib/partition.h"
943#include "stringlib/split.h"
944#include "stringlib/count.h"
945#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300946#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200947#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200948#include "stringlib/undef.h"
949
950#include "stringlib/ucs4lib.h"
951#include "stringlib/fastsearch.h"
952#include "stringlib/partition.h"
953#include "stringlib/split.h"
954#include "stringlib/count.h"
955#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300956#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200957#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200958#include "stringlib/undef.h"
959
Inada Naoki2c4928d2020-06-17 20:09:44 +0900960_Py_COMP_DIAG_PUSH
961_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962#include "stringlib/unicodedefs.h"
963#include "stringlib/fastsearch.h"
964#include "stringlib/count.h"
965#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100966#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900967_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200968
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200969#undef STRINGLIB_GET_EMPTY
970
Guido van Rossumd57fd912000-03-10 22:53:23 +0000971/* --- Unicode Object ----------------------------------------------------- */
972
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700973static inline Py_ssize_t
974findchar(const void *s, int kind,
975 Py_ssize_t size, Py_UCS4 ch,
976 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200978 switch (kind) {
979 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200980 if ((Py_UCS1) ch != ch)
981 return -1;
982 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600983 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200984 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600985 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200987 if ((Py_UCS2) ch != ch)
988 return -1;
989 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600990 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200991 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600992 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200993 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200994 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600995 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200996 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600997 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200998 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700999 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001}
1002
#ifdef Py_DEBUG
/* Overwrite the characters added by a resize, i.e. those in
   [old_length, current length), with 0xff bytes so that bugs show up early.
   Nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_width, 0xff,
               (new_length - old_length) * char_width);
    }
}
#endif
1021
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022static PyObject*
1023resize_compact(PyObject *unicode, Py_ssize_t length)
1024{
1025 Py_ssize_t char_size;
1026 Py_ssize_t struct_size;
1027 Py_ssize_t new_size;
1028 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001029 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001030#ifdef Py_DEBUG
1031 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1032#endif
1033
Victor Stinner79891572012-05-03 13:43:07 +02001034 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001036 assert(PyUnicode_IS_COMPACT(unicode));
1037
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001038 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001039 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 struct_size = sizeof(PyASCIIObject);
1041 else
1042 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001043 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001044
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1046 PyErr_NoMemory();
1047 return NULL;
1048 }
1049 new_size = (struct_size + (length + 1) * char_size);
1050
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001051 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001052 PyObject_Free(_PyUnicode_UTF8(unicode));
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001053 _PyUnicode_UTF8(unicode) = NULL;
1054 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1055 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001056#ifdef Py_REF_DEBUG
1057 _Py_RefTotal--;
1058#endif
1059#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001060 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001061#endif
Victor Stinner84def372011-12-11 20:04:56 +01001062
Victor Stinner32bd68c2020-12-01 10:37:39 +01001063 new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001064 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001065 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001066 PyErr_NoMemory();
1067 return NULL;
1068 }
Victor Stinner84def372011-12-11 20:04:56 +01001069 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001071
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001073 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001075 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001076 _PyUnicode_WSTR_LENGTH(unicode) = length;
1077 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001078 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001079 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001080 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001081 if (!PyUnicode_IS_ASCII(unicode))
1082 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001083 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001084#ifdef Py_DEBUG
1085 unicode_fill_invalid(unicode, old_length);
1086#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001087 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1088 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001089 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001090 return unicode;
1091}
1092
Alexander Belopolsky40018472011-02-26 01:02:56 +00001093static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001094resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095{
Victor Stinner95663112011-10-04 01:03:50 +02001096 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001097 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001099 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001100
Victor Stinnerfe226c02011-10-03 03:52:20 +02001101 if (PyUnicode_IS_READY(unicode)) {
1102 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001103 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001104 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001105#ifdef Py_DEBUG
1106 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1107#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001108
1109 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001110 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001111 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1112 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001113
1114 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1115 PyErr_NoMemory();
1116 return -1;
1117 }
1118 new_size = (length + 1) * char_size;
1119
Victor Stinner7a9105a2011-12-12 00:13:42 +01001120 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1121 {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001122 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinner7a9105a2011-12-12 00:13:42 +01001123 _PyUnicode_UTF8(unicode) = NULL;
1124 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1125 }
1126
Victor Stinner32bd68c2020-12-01 10:37:39 +01001127 data = (PyObject *)PyObject_Realloc(data, new_size);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001128 if (data == NULL) {
1129 PyErr_NoMemory();
1130 return -1;
1131 }
1132 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001133 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001134 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001135 _PyUnicode_WSTR_LENGTH(unicode) = length;
1136 }
1137 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001138 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001139 _PyUnicode_UTF8_LENGTH(unicode) = length;
1140 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001141 _PyUnicode_LENGTH(unicode) = length;
1142 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001143#ifdef Py_DEBUG
1144 unicode_fill_invalid(unicode, old_length);
1145#endif
Victor Stinner95663112011-10-04 01:03:50 +02001146 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001147 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001148 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001150 }
Victor Stinner95663112011-10-04 01:03:50 +02001151 assert(_PyUnicode_WSTR(unicode) != NULL);
1152
1153 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001154 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001155 PyErr_NoMemory();
1156 return -1;
1157 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001158 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001159 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001160 wstr = PyObject_Realloc(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001161 if (!wstr) {
1162 PyErr_NoMemory();
1163 return -1;
1164 }
1165 _PyUnicode_WSTR(unicode) = wstr;
1166 _PyUnicode_WSTR(unicode)[length] = 0;
1167 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001168 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 return 0;
1170}
1171
Victor Stinnerfe226c02011-10-03 03:52:20 +02001172static PyObject*
1173resize_copy(PyObject *unicode, Py_ssize_t length)
1174{
1175 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001176 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001177 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001178
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001179 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180
1181 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1182 if (copy == NULL)
1183 return NULL;
1184
1185 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001186 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001187 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001188 }
1189 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001190 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001191
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001192 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001193 if (w == NULL)
1194 return NULL;
1195 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1196 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001197 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001198 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001199 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001200 }
1201}
1202
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001204 Ux0000 terminated; some code (e.g. new_identifier)
1205 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206
1207 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001208 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210*/
1211
Alexander Belopolsky40018472011-02-26 01:02:56 +00001212static PyUnicodeObject *
1213_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001215 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
Thomas Wouters477c8d52006-05-27 19:21:47 +00001218 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001219 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001220 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 }
1222
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001223 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001224 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001225 return (PyUnicodeObject *)PyErr_NoMemory();
1226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 if (length < 0) {
1228 PyErr_SetString(PyExc_SystemError,
1229 "Negative size passed to _PyUnicode_New");
1230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 }
1232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1234 if (unicode == NULL)
1235 return NULL;
1236 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001237
1238 _PyUnicode_WSTR_LENGTH(unicode) = length;
1239 _PyUnicode_HASH(unicode) = -1;
1240 _PyUnicode_STATE(unicode).interned = 0;
1241 _PyUnicode_STATE(unicode).kind = 0;
1242 _PyUnicode_STATE(unicode).compact = 0;
1243 _PyUnicode_STATE(unicode).ready = 0;
1244 _PyUnicode_STATE(unicode).ascii = 0;
1245 _PyUnicode_DATA_ANY(unicode) = NULL;
1246 _PyUnicode_LENGTH(unicode) = 0;
1247 _PyUnicode_UTF8(unicode) = NULL;
1248 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1249
Victor Stinner32bd68c2020-12-01 10:37:39 +01001250 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001252 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001253 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001254 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256
Jeremy Hyltond8082792003-09-16 19:41:39 +00001257 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001258 * the caller fails before initializing str -- unicode_resize()
1259 * reads str[0], and the Keep-Alive optimization can keep memory
1260 * allocated for str alive across a call to unicode_dealloc(unicode).
1261 * We don't want unicode_resize to read uninitialized memory in
1262 * that case.
1263 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264 _PyUnicode_WSTR(unicode)[0] = 0;
1265 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001266
Victor Stinner7931d9a2011-11-04 00:22:48 +01001267 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 return unicode;
1269}
1270
Victor Stinnerf42dc442011-10-02 23:33:16 +02001271static const char*
1272unicode_kind_name(PyObject *unicode)
1273{
Victor Stinner42dfd712011-10-03 14:41:45 +02001274 /* don't check consistency: unicode_kind_name() is called from
1275 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 if (!PyUnicode_IS_COMPACT(unicode))
1277 {
1278 if (!PyUnicode_IS_READY(unicode))
1279 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001280 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001281 {
1282 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001283 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 return "legacy ascii";
1285 else
1286 return "legacy latin1";
1287 case PyUnicode_2BYTE_KIND:
1288 return "legacy UCS2";
1289 case PyUnicode_4BYTE_KIND:
1290 return "legacy UCS4";
1291 default:
1292 return "<legacy invalid kind>";
1293 }
1294 }
1295 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001296 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001297 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001298 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001299 return "ascii";
1300 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001301 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001302 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001303 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001304 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001305 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001306 default:
1307 return "<invalid compact kind>";
1308 }
1309}
1310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001313const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001314 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001315 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316}
1317
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001318const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001319 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 return _PyUnicode_COMPACT_DATA(unicode);
1321}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001322const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001323 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001324 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1326 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1327 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1328 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1329 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1330 return PyUnicode_DATA(unicode);
1331}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001332
1333void
1334_PyUnicode_Dump(PyObject *op)
1335{
1336 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001337 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1338 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001339 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001340
Victor Stinnera849a4b2011-10-03 12:12:11 +02001341 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001342 {
1343 if (ascii->state.ascii)
1344 data = (ascii + 1);
1345 else
1346 data = (compact + 1);
1347 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001348 else
1349 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001350 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001351
Victor Stinnera849a4b2011-10-03 12:12:11 +02001352 if (ascii->wstr == data)
1353 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001354 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001355
Victor Stinnera3b334d2011-10-03 13:53:37 +02001356 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001357 printf(" (%zu), ", compact->wstr_length);
1358 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001359 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001360 }
1361 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001362 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001363 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001364}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365#endif
1366
Victor Stinner91698d82020-06-25 14:07:40 +02001367static int
1368unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1369{
1370 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1371 // optimized to always use state->empty_string without having to check if
1372 // it is NULL or not.
1373 PyObject *empty = PyUnicode_New(1, 0);
1374 if (empty == NULL) {
1375 return -1;
1376 }
1377 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1378 _PyUnicode_LENGTH(empty) = 0;
1379 assert(_PyUnicode_CheckConsistency(empty, 1));
1380
1381 assert(state->empty_string == NULL);
1382 state->empty_string = empty;
1383 return 0;
1384}
1385
1386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387PyObject *
1388PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1389{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001390 /* Optimization for empty strings */
1391 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001392 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001393 }
1394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 PyObject *obj;
1396 PyCompactUnicodeObject *unicode;
1397 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001398 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001399 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 Py_ssize_t char_size;
1401 Py_ssize_t struct_size;
1402
Victor Stinner9e9d6892011-10-04 01:02:02 +02001403 is_ascii = 0;
1404 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 struct_size = sizeof(PyCompactUnicodeObject);
1406 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001407 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 char_size = 1;
1409 is_ascii = 1;
1410 struct_size = sizeof(PyASCIIObject);
1411 }
1412 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001413 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 char_size = 1;
1415 }
1416 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001417 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 char_size = 2;
1419 if (sizeof(wchar_t) == 2)
1420 is_sharing = 1;
1421 }
1422 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001423 if (maxchar > MAX_UNICODE) {
1424 PyErr_SetString(PyExc_SystemError,
1425 "invalid maximum character passed to PyUnicode_New");
1426 return NULL;
1427 }
Victor Stinner8f825062012-04-27 13:55:39 +02001428 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 char_size = 4;
1430 if (sizeof(wchar_t) == 4)
1431 is_sharing = 1;
1432 }
1433
1434 /* Ensure we won't overflow the size. */
1435 if (size < 0) {
1436 PyErr_SetString(PyExc_SystemError,
1437 "Negative size passed to PyUnicode_New");
1438 return NULL;
1439 }
1440 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1441 return PyErr_NoMemory();
1442
1443 /* Duplicated allocation code from _PyObject_New() instead of a call to
1444 * PyObject_New() so we are able to allocate space for the object and
1445 * it's data buffer.
1446 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001447 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001448 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001450 }
1451 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452
1453 unicode = (PyCompactUnicodeObject *)obj;
1454 if (is_ascii)
1455 data = ((PyASCIIObject*)obj) + 1;
1456 else
1457 data = unicode + 1;
1458 _PyUnicode_LENGTH(unicode) = size;
1459 _PyUnicode_HASH(unicode) = -1;
1460 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001461 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 _PyUnicode_STATE(unicode).compact = 1;
1463 _PyUnicode_STATE(unicode).ready = 1;
1464 _PyUnicode_STATE(unicode).ascii = is_ascii;
1465 if (is_ascii) {
1466 ((char*)data)[size] = 0;
1467 _PyUnicode_WSTR(unicode) = NULL;
1468 }
Victor Stinner8f825062012-04-27 13:55:39 +02001469 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 ((char*)data)[size] = 0;
1471 _PyUnicode_WSTR(unicode) = NULL;
1472 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001474 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 else {
1477 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001478 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001479 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001481 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 ((Py_UCS4*)data)[size] = 0;
1483 if (is_sharing) {
1484 _PyUnicode_WSTR_LENGTH(unicode) = size;
1485 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1486 }
1487 else {
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 _PyUnicode_WSTR(unicode) = NULL;
1490 }
1491 }
Victor Stinner8f825062012-04-27 13:55:39 +02001492#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001493 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001494#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001495 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 return obj;
1497}
1498
#if SIZEOF_WCHAR_T == 2
/* Convert a 16-bit wchar_t buffer [begin, end) into the UCS4 data of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into one code point; every other unit (lone surrogates included)
   is copied as is.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    Py_UCS4 *out = PyUnicode_4BYTE_DATA(unicode);
    const wchar_t *in = begin;

    while (in < end) {
        assert(out < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(in[0])
            && (in + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(in[1]))
        {
            *out++ = Py_UNICODE_JOIN_SURROGATES(in[0], in[1]);
            in += 2;
        }
        else {
            *out++ = *in++;
        }
    }

    /* The output must have been filled exactly. */
    assert(out == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1538
Victor Stinnercd9950f2011-10-02 00:34:53 +02001539static int
Victor Stinner488fa492011-12-12 00:01:39 +01001540unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001541{
Victor Stinner488fa492011-12-12 00:01:39 +01001542 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001543 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001544 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001545 return -1;
1546 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547 return 0;
1548}
1549
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550static int
1551_copy_characters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001555 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001556 const void *from_data;
1557 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558
Victor Stinneree4544c2012-05-09 22:24:08 +02001559 assert(0 <= how_many);
1560 assert(0 <= from_start);
1561 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001562 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001564 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565
Victor Stinnerd3f08822012-05-29 12:57:52 +02001566 assert(PyUnicode_Check(to));
1567 assert(PyUnicode_IS_READY(to));
1568 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1569
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001570 if (how_many == 0)
1571 return 0;
1572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001574 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001576 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577
Victor Stinnerf1852262012-06-16 16:38:26 +02001578#ifdef Py_DEBUG
1579 if (!check_maxchar
1580 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1581 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001582 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001583 Py_UCS4 ch;
1584 Py_ssize_t i;
1585 for (i=0; i < how_many; i++) {
1586 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1587 assert(ch <= to_maxchar);
1588 }
1589 }
1590#endif
1591
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001592 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001593 if (check_maxchar
1594 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1595 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001596 /* Writing Latin-1 characters into an ASCII string requires to
1597 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001598 Py_UCS4 max_char;
1599 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001600 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (max_char >= 128)
1602 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001603 }
Christian Heimesf051e432016-09-13 20:22:02 +02001604 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001605 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001606 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001608 else if (from_kind == PyUnicode_1BYTE_KIND
1609 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001610 {
1611 _PyUnicode_CONVERT_BYTES(
1612 Py_UCS1, Py_UCS2,
1613 PyUnicode_1BYTE_DATA(from) + from_start,
1614 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1615 PyUnicode_2BYTE_DATA(to) + to_start
1616 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001617 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001618 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001619 && to_kind == PyUnicode_4BYTE_KIND)
1620 {
1621 _PyUnicode_CONVERT_BYTES(
1622 Py_UCS1, Py_UCS4,
1623 PyUnicode_1BYTE_DATA(from) + from_start,
1624 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1625 PyUnicode_4BYTE_DATA(to) + to_start
1626 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 }
1628 else if (from_kind == PyUnicode_2BYTE_KIND
1629 && to_kind == PyUnicode_4BYTE_KIND)
1630 {
1631 _PyUnicode_CONVERT_BYTES(
1632 Py_UCS2, Py_UCS4,
1633 PyUnicode_2BYTE_DATA(from) + from_start,
1634 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1635 PyUnicode_4BYTE_DATA(to) + to_start
1636 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001637 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001638 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001639 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1640
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001641 if (!check_maxchar) {
1642 if (from_kind == PyUnicode_2BYTE_KIND
1643 && to_kind == PyUnicode_1BYTE_KIND)
1644 {
1645 _PyUnicode_CONVERT_BYTES(
1646 Py_UCS2, Py_UCS1,
1647 PyUnicode_2BYTE_DATA(from) + from_start,
1648 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1649 PyUnicode_1BYTE_DATA(to) + to_start
1650 );
1651 }
1652 else if (from_kind == PyUnicode_4BYTE_KIND
1653 && to_kind == PyUnicode_1BYTE_KIND)
1654 {
1655 _PyUnicode_CONVERT_BYTES(
1656 Py_UCS4, Py_UCS1,
1657 PyUnicode_4BYTE_DATA(from) + from_start,
1658 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1659 PyUnicode_1BYTE_DATA(to) + to_start
1660 );
1661 }
1662 else if (from_kind == PyUnicode_4BYTE_KIND
1663 && to_kind == PyUnicode_2BYTE_KIND)
1664 {
1665 _PyUnicode_CONVERT_BYTES(
1666 Py_UCS4, Py_UCS2,
1667 PyUnicode_4BYTE_DATA(from) + from_start,
1668 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1669 PyUnicode_2BYTE_DATA(to) + to_start
1670 );
1671 }
1672 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001673 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001674 }
1675 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001676 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001677 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001678 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001679 Py_ssize_t i;
1680
Victor Stinnera0702ab2011-09-29 14:14:38 +02001681 for (i=0; i < how_many; i++) {
1682 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001683 if (ch > to_maxchar)
1684 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1686 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 }
1688 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001689 return 0;
1690}
1691
Victor Stinnerd3f08822012-05-29 12:57:52 +02001692void
1693_PyUnicode_FastCopyCharacters(
1694 PyObject *to, Py_ssize_t to_start,
1695 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001696{
1697 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1698}
1699
1700Py_ssize_t
1701PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1702 PyObject *from, Py_ssize_t from_start,
1703 Py_ssize_t how_many)
1704{
1705 int err;
1706
1707 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1708 PyErr_BadInternalCall();
1709 return -1;
1710 }
1711
Benjamin Petersonbac79492012-01-14 13:34:47 -05001712 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001713 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001714 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001715 return -1;
1716
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001717 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001718 PyErr_SetString(PyExc_IndexError, "string index out of range");
1719 return -1;
1720 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001721 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001722 PyErr_SetString(PyExc_IndexError, "string index out of range");
1723 return -1;
1724 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if (how_many < 0) {
1726 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1727 return -1;
1728 }
1729 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001730 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1731 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001732 "Cannot write %zi characters at %zi "
1733 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001734 how_many, to_start, PyUnicode_GET_LENGTH(to));
1735 return -1;
1736 }
1737
1738 if (how_many == 0)
1739 return 0;
1740
Victor Stinner488fa492011-12-12 00:01:39 +01001741 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 return -1;
1743
1744 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1745 if (err) {
1746 PyErr_Format(PyExc_SystemError,
1747 "Cannot copy %s characters "
1748 "into a string of %s characters",
1749 unicode_kind_name(from),
1750 unicode_kind_name(to));
1751 return -1;
1752 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001753 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754}
1755
Victor Stinner17222162011-09-28 22:15:37 +02001756/* Find the maximum code point and count the number of surrogate pairs so a
1757 correct string length can be computed before converting a string to UCS4.
1758 This function counts single surrogates as a character and not as a pair.
1759
1760 Return 0 on success, or -1 on error. */
1761static int
1762find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1763 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764{
1765 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001766 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767
Victor Stinnerc53be962011-10-02 21:33:54 +02001768 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 *num_surrogates = 0;
1770 *maxchar = 0;
1771
1772 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001774 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1775 && (iter+1) < end
1776 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1777 {
1778 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1779 ++(*num_surrogates);
1780 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 }
1782 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001784 {
1785 ch = *iter;
1786 iter++;
1787 }
1788 if (ch > *maxchar) {
1789 *maxchar = ch;
1790 if (*maxchar > MAX_UNICODE) {
1791 PyErr_Format(PyExc_ValueError,
Victor Stinner99768342021-03-17 21:46:53 +01001792 "character U+%x is not in range [U+0000; U+%x]",
1793 ch, MAX_UNICODE);
Victor Stinner8faf8212011-12-08 22:14:11 +01001794 return -1;
1795 }
1796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 }
1798 return 0;
1799}
1800
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001801int
1802_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803{
1804 wchar_t *end;
1805 Py_UCS4 maxchar = 0;
1806 Py_ssize_t num_surrogates;
1807#if SIZEOF_WCHAR_T == 2
1808 Py_ssize_t length_wo_surrogates;
1809#endif
1810
Georg Brandl7597add2011-10-05 16:36:47 +02001811 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001812 strings were created using _PyObject_New() and where no canonical
1813 representation (the str field) has been set yet aka strings
1814 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001815 assert(_PyUnicode_CHECK(unicode));
1816 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001818 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001819 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 /* Actually, it should neither be interned nor be anything else: */
1821 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001824 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001825 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827
1828 if (maxchar < 256) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001829 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001830 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 PyErr_NoMemory();
1832 return -1;
1833 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001834 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 _PyUnicode_WSTR(unicode), end,
1836 PyUnicode_1BYTE_DATA(unicode));
1837 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1838 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1839 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1840 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001841 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001842 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001843 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 }
1845 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001846 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001847 _PyUnicode_UTF8(unicode) = NULL;
1848 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001850 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 _PyUnicode_WSTR(unicode) = NULL;
1852 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1853 }
1854 /* In this case we might have to convert down from 4-byte native
1855 wchar_t to 2-byte unicode. */
1856 else if (maxchar < 65536) {
1857 assert(num_surrogates == 0 &&
1858 "FindMaxCharAndNumSurrogatePairs() messed up");
1859
Victor Stinner506f5922011-09-28 22:34:18 +02001860#if SIZEOF_WCHAR_T == 2
1861 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001862 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001863 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1864 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1865 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001866 _PyUnicode_UTF8(unicode) = NULL;
1867 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001868#else
1869 /* sizeof(wchar_t) == 4 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001870 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
Victor Stinner506f5922011-09-28 22:34:18 +02001871 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001872 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001873 PyErr_NoMemory();
1874 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 }
Victor Stinner506f5922011-09-28 22:34:18 +02001876 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1877 _PyUnicode_WSTR(unicode), end,
1878 PyUnicode_2BYTE_DATA(unicode));
1879 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1880 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1881 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001882 _PyUnicode_UTF8(unicode) = NULL;
1883 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner32bd68c2020-12-01 10:37:39 +01001884 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinner506f5922011-09-28 22:34:18 +02001885 _PyUnicode_WSTR(unicode) = NULL;
1886 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1887#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 }
Ikko Ashimine38811d62020-11-10 14:57:34 +09001889 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 else {
1891#if SIZEOF_WCHAR_T == 2
1892 /* in case the native representation is 2-bytes, we need to allocate a
1893 new normalized 4-byte version. */
1894 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001895 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1896 PyErr_NoMemory();
1897 return -1;
1898 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001899 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001900 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 PyErr_NoMemory();
1902 return -1;
1903 }
1904 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1905 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001906 _PyUnicode_UTF8(unicode) = NULL;
1907 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001908 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1909 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001910 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001911 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 _PyUnicode_WSTR(unicode) = NULL;
1913 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1914#else
1915 assert(num_surrogates == 0);
1916
Victor Stinnerc3c74152011-10-02 20:39:55 +02001917 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001919 _PyUnicode_UTF8(unicode) = NULL;
1920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1922#endif
1923 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1924 }
1925 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001926 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 return 0;
1928}
1929
Alexander Belopolsky40018472011-02-26 01:02:56 +00001930static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001931unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932{
Walter Dörwald16807132007-05-25 13:52:07 +00001933 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 case SSTATE_NOT_INTERNED:
1935 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001936
Benjamin Peterson29060642009-01-31 22:14:21 +00001937 case SSTATE_INTERNED_MORTAL:
Victor Stinnerea251802020-12-26 02:58:33 +01001938 {
1939 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner3549ca32020-07-03 16:59:12 +02001940 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1941 references (key and value) which were ignored by
1942 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1943 to prevent calling unicode_dealloc() again. Adjust refcnt after
1944 PyDict_DelItem(). */
1945 assert(Py_REFCNT(unicode) == 0);
1946 Py_SET_REFCNT(unicode, 3);
Victor Stinnerea251802020-12-26 02:58:33 +01001947 if (PyDict_DelItem(state->interned, unicode) != 0) {
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001948 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1949 NULL);
1950 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001951 assert(Py_REFCNT(unicode) == 1);
1952 Py_SET_REFCNT(unicode, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001953 break;
Victor Stinnerea251802020-12-26 02:58:33 +01001954 }
Walter Dörwald16807132007-05-25 13:52:07 +00001955
Benjamin Peterson29060642009-01-31 22:14:21 +00001956 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001957 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1958 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001959
Benjamin Peterson29060642009-01-31 22:14:21 +00001960 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001961 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001962 }
1963
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001965 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001966 }
1967 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001968 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001969 }
1970 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001971 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001974 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975}
1976
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if unicode is one of the shared objects held
   in the unicode state (the empty string, or the cached one-character
   string for a code point below 256), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;

    if (state->empty_string == unicode) {
        return 1;
    }

    /* Only a one-character string that is not in the wchar kind can be
       looked up in the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && state->latin1[ch] == unicode) ? 1 : 0;
}
#endif
1996
Alexander Belopolsky40018472011-02-26 01:02:56 +00001997static int
Victor Stinner488fa492011-12-12 00:01:39 +01001998unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999{
Victor Stinner488fa492011-12-12 00:01:39 +01002000 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002001 if (Py_REFCNT(unicode) != 1)
2002 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002003 if (_PyUnicode_HASH(unicode) != -1)
2004 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002005 if (PyUnicode_CHECK_INTERNED(unicode))
2006 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002007 if (!PyUnicode_CheckExact(unicode))
2008 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002009#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002010 /* singleton refcount is greater than 1 */
2011 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002012#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002013 return 1;
2014}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002015
Victor Stinnerfe226c02011-10-03 03:52:20 +02002016static int
2017unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2018{
2019 PyObject *unicode;
2020 Py_ssize_t old_length;
2021
2022 assert(p_unicode != NULL);
2023 unicode = *p_unicode;
2024
2025 assert(unicode != NULL);
2026 assert(PyUnicode_Check(unicode));
2027 assert(0 <= length);
2028
Victor Stinner910337b2011-10-03 03:20:16 +02002029 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002030 old_length = PyUnicode_WSTR_LENGTH(unicode);
2031 else
2032 old_length = PyUnicode_GET_LENGTH(unicode);
2033 if (old_length == length)
2034 return 0;
2035
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002036 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002037 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002038 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002039 return 0;
2040 }
2041
Victor Stinner488fa492011-12-12 00:01:39 +01002042 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002043 PyObject *copy = resize_copy(unicode, length);
2044 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002046 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002048 }
2049
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002051 PyObject *new_unicode = resize_compact(unicode, length);
2052 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002053 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002054 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002055 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002056 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002057 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002058}
2059
Alexander Belopolsky40018472011-02-26 01:02:56 +00002060int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002061PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002062{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002063 PyObject *unicode;
2064 if (p_unicode == NULL) {
2065 PyErr_BadInternalCall();
2066 return -1;
2067 }
2068 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002069 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002070 {
2071 PyErr_BadInternalCall();
2072 return -1;
2073 }
2074 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002075}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002076
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002077/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002078
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002079 WARNING: The function doesn't copy the terminating null character and
2080 doesn't check the maximum character (may write a latin1 character in an
2081 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002082static void
2083unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2084 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002085{
2086 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002087 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002088 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002089
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002090 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002091 switch (kind) {
2092 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002093#ifdef Py_DEBUG
2094 if (PyUnicode_IS_ASCII(unicode)) {
2095 Py_UCS4 maxchar = ucs1lib_find_max_char(
2096 (const Py_UCS1*)str,
2097 (const Py_UCS1*)str + len);
2098 assert(maxchar < 128);
2099 }
2100#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002101 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002102 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103 }
2104 case PyUnicode_2BYTE_KIND: {
2105 Py_UCS2 *start = (Py_UCS2 *)data + index;
2106 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002107
Victor Stinner184252a2012-06-16 02:57:41 +02002108 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 *ucs2 = (Py_UCS2)*str;
2110
2111 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002112 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002113 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002114 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002115 Py_UCS4 *start = (Py_UCS4 *)data + index;
2116 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002117
Victor Stinner184252a2012-06-16 02:57:41 +02002118 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002119 *ucs4 = (Py_UCS4)*str;
2120
2121 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002122 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002123 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002124 default:
2125 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 }
2127}
2128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002130get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002132 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002133
Victor Stinner2f9ada92020-06-24 02:22:21 +02002134 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002135 if (unicode) {
2136 Py_INCREF(unicode);
2137 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 }
Victor Stinner607b1022020-05-05 18:50:30 +02002139
2140 unicode = PyUnicode_New(1, ch);
2141 if (!unicode) {
2142 return NULL;
2143 }
2144
2145 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2146 assert(_PyUnicode_CheckConsistency(unicode, 1));
2147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002149 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002150 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151}
2152
Victor Stinner985a82a2014-01-03 12:53:47 +01002153static PyObject*
2154unicode_char(Py_UCS4 ch)
2155{
2156 PyObject *unicode;
2157
2158 assert(ch <= MAX_UNICODE);
2159
Victor Stinner2f9ada92020-06-24 02:22:21 +02002160 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002161 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002162 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002163
Victor Stinner985a82a2014-01-03 12:53:47 +01002164 unicode = PyUnicode_New(1, ch);
2165 if (unicode == NULL)
2166 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002167
2168 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2169 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002170 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002171 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002172 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2173 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2174 }
2175 assert(_PyUnicode_CheckConsistency(unicode, 1));
2176 return unicode;
2177}
2178
Alexander Belopolsky40018472011-02-26 01:02:56 +00002179PyObject *
2180PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002182 if (u == NULL) {
2183 if (size > 0) {
2184 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2185 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2186 "use PyUnicode_New() instead", 1) < 0) {
2187 return NULL;
2188 }
2189 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002190 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002191 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002192
2193 if (size < 0) {
2194 PyErr_BadInternalCall();
2195 return NULL;
2196 }
2197
2198 return PyUnicode_FromWideChar(u, size);
2199}
2200
2201PyObject *
2202PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2203{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002204 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 Py_UCS4 maxchar = 0;
2206 Py_ssize_t num_surrogates;
2207
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002208 if (u == NULL && size != 0) {
2209 PyErr_BadInternalCall();
2210 return NULL;
2211 }
2212
2213 if (size == -1) {
2214 size = wcslen(u);
2215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002217 /* If the Unicode data is known at construction time, we can apply
2218 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002221 if (size == 0)
2222 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002223
Jakub Kulík9032cf52021-04-30 15:21:42 +02002224#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2225 /* Oracle Solaris uses non-Unicode internal wchar_t form for
2226 non-Unicode locales and hence needs conversion to UCS-4 first. */
2227 if (_Py_LocaleUsesNonUnicodeWchar()) {
2228 wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2229 if (!converted) {
2230 return NULL;
2231 }
2232 PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2233 PyMem_Free(converted);
2234 return unicode;
2235 }
2236#endif
2237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 /* Single character Unicode objects in the Latin-1 range are
2239 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002240 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 return get_latin1_char((unsigned char)*u);
2242
2243 /* If not empty and not single character, copy the Unicode data
2244 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002245 if (find_maxchar_surrogates(u, u + size,
2246 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 return NULL;
2248
Victor Stinner8faf8212011-12-08 22:14:11 +01002249 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 if (!unicode)
2251 return NULL;
2252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 switch (PyUnicode_KIND(unicode)) {
2254 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002255 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2257 break;
2258 case PyUnicode_2BYTE_KIND:
2259#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002260 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002262 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2264#endif
2265 break;
2266 case PyUnicode_4BYTE_KIND:
2267#if SIZEOF_WCHAR_T == 2
2268 /* This is the only case which has to process surrogates, thus
2269 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002270 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271#else
2272 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002273 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274#endif
2275 break;
2276 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002277 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002280 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281}
2282
Alexander Belopolsky40018472011-02-26 01:02:56 +00002283PyObject *
2284PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002285{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002286 if (size < 0) {
2287 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002288 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002289 return NULL;
2290 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002291 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002292 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002293 }
2294 else {
2295 if (size > 0) {
2296 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2297 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2298 "use PyUnicode_New() instead", 1) < 0) {
2299 return NULL;
2300 }
2301 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002302 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002303 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002304}
2305
Alexander Belopolsky40018472011-02-26 01:02:56 +00002306PyObject *
2307PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002308{
2309 size_t size = strlen(u);
2310 if (size > PY_SSIZE_T_MAX) {
2311 PyErr_SetString(PyExc_OverflowError, "input too long");
2312 return NULL;
2313 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002314 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002315}
2316
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002317
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002318PyObject *
2319_PyUnicode_FromId(_Py_Identifier *id)
2320{
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002321 PyInterpreterState *interp = _PyInterpreterState_GET();
2322 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2323
Pablo Galindoa6d63a22020-12-29 00:28:09 +00002324 Py_ssize_t index = _Py_atomic_size_get(&id->index);
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002325 if (index < 0) {
2326 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2327
2328 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2329 // Check again to detect concurrent access. Another thread can have
2330 // initialized the index while this thread waited for the lock.
2331 index = _Py_atomic_size_get(&id->index);
2332 if (index < 0) {
2333 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2334 index = rt_ids->next_index;
2335 rt_ids->next_index++;
2336 _Py_atomic_size_set(&id->index, index);
2337 }
2338 PyThread_release_lock(rt_ids->lock);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002339 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002340 assert(index >= 0);
Victor Stinner297257f2020-06-02 14:39:45 +02002341
2342 PyObject *obj;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002343 if (index < ids->size) {
2344 obj = ids->array[index];
2345 if (obj) {
2346 // Return a borrowed reference
2347 return obj;
2348 }
2349 }
2350
2351 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
Victor Stinner297257f2020-06-02 14:39:45 +02002352 NULL, NULL);
2353 if (!obj) {
2354 return NULL;
2355 }
2356 PyUnicode_InternInPlace(&obj);
2357
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002358 if (index >= ids->size) {
2359 // Overallocate to reduce the number of realloc
2360 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2361 Py_ssize_t item_size = sizeof(ids->array[0]);
2362 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2363 if (new_array == NULL) {
2364 PyErr_NoMemory();
2365 return NULL;
2366 }
2367 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2368 ids->array = new_array;
2369 ids->size = new_size;
2370 }
2371
2372 // The array stores a strong reference
2373 ids->array[index] = obj;
2374
2375 // Return a borrowed reference
2376 return obj;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002377}
2378
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002379
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002380static void
Victor Stinnerf4507232020-12-26 20:26:08 +01002381unicode_clear_identifiers(struct _Py_unicode_state *state)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002382{
Victor Stinnerf4507232020-12-26 20:26:08 +01002383 struct _Py_unicode_ids *ids = &state->ids;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002384 for (Py_ssize_t i=0; i < ids->size; i++) {
2385 Py_XDECREF(ids->array[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002386 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002387 ids->size = 0;
2388 PyMem_Free(ids->array);
2389 ids->array = NULL;
2390 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2391 // after Py_Finalize().
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002392}
2393
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002394
Benjamin Peterson0df54292012-03-26 14:50:32 -04002395/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002396
Victor Stinnerd3f08822012-05-29 12:57:52 +02002397PyObject*
2398_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002399{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002400 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002401 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002402 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002403#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002404 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002405#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002406 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002407 }
Victor Stinner785938e2011-12-11 20:09:03 +01002408 unicode = PyUnicode_New(size, 127);
2409 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002410 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002411 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2412 assert(_PyUnicode_CheckConsistency(unicode, 1));
2413 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002414}
2415
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002416static Py_UCS4
2417kind_maxchar_limit(unsigned int kind)
2418{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002419 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002420 case PyUnicode_1BYTE_KIND:
2421 return 0x80;
2422 case PyUnicode_2BYTE_KIND:
2423 return 0x100;
2424 case PyUnicode_4BYTE_KIND:
2425 return 0x10000;
2426 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002427 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002428 }
2429}
2430
Victor Stinner702c7342011-10-05 13:50:52 +02002431static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002432_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002435 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002436
Victor Stinner2f9ada92020-06-24 02:22:21 +02002437 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002438 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002439 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002440 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002441 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002442 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002443 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002444
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002445 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002446 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 if (!res)
2448 return NULL;
2449 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002450 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002452}
2453
Victor Stinnere57b1c02011-09-28 22:20:48 +02002454static PyObject*
2455_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456{
2457 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002458 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002459
Serhiy Storchaka678db842013-01-26 12:16:36 +02002460 if (size == 0)
2461 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002462 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002463 if (size == 1)
2464 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002465
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002466 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002467 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 if (!res)
2469 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002470 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002472 else {
2473 _PyUnicode_CONVERT_BYTES(
2474 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2475 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002476 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 return res;
2478}
2479
Victor Stinnere57b1c02011-09-28 22:20:48 +02002480static PyObject*
2481_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002482{
2483 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002484 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002485
Serhiy Storchaka678db842013-01-26 12:16:36 +02002486 if (size == 0)
2487 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002488 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002489 if (size == 1)
2490 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002491
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002492 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002493 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 if (!res)
2495 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002496 if (max_char < 256)
2497 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2498 PyUnicode_1BYTE_DATA(res));
2499 else if (max_char < 0x10000)
2500 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2501 PyUnicode_2BYTE_DATA(res));
2502 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002504 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 return res;
2506}
2507
2508PyObject*
2509PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2510{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002511 if (size < 0) {
2512 PyErr_SetString(PyExc_ValueError, "size must be positive");
2513 return NULL;
2514 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002515 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002517 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002519 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002521 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002522 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002523 PyErr_SetString(PyExc_SystemError, "invalid kind");
2524 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526}
2527
Victor Stinnerece58de2012-04-23 23:36:38 +02002528Py_UCS4
2529_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2530{
2531 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002532 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002533
2534 assert(PyUnicode_IS_READY(unicode));
2535 assert(0 <= start);
2536 assert(end <= PyUnicode_GET_LENGTH(unicode));
2537 assert(start <= end);
2538
2539 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2540 return PyUnicode_MAX_CHAR_VALUE(unicode);
2541
2542 if (start == end)
2543 return 127;
2544
Victor Stinner94d558b2012-04-27 22:26:58 +02002545 if (PyUnicode_IS_ASCII(unicode))
2546 return 127;
2547
Victor Stinnerece58de2012-04-23 23:36:38 +02002548 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002549 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002550 endptr = (char *)startptr + end * kind;
2551 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002552 switch(kind) {
2553 case PyUnicode_1BYTE_KIND:
2554 return ucs1lib_find_max_char(startptr, endptr);
2555 case PyUnicode_2BYTE_KIND:
2556 return ucs2lib_find_max_char(startptr, endptr);
2557 case PyUnicode_4BYTE_KIND:
2558 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002559 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002560 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002561 }
2562}
2563
Victor Stinner25a4b292011-10-06 12:31:55 +02002564/* Ensure that a string uses the most efficient storage, if it is not the
2565 case: create a new string with of the right kind. Write NULL into *p_unicode
2566 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002567static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002568unicode_adjust_maxchar(PyObject **p_unicode)
2569{
2570 PyObject *unicode, *copy;
2571 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002572 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002573 unsigned int kind;
2574
2575 assert(p_unicode != NULL);
2576 unicode = *p_unicode;
2577 assert(PyUnicode_IS_READY(unicode));
2578 if (PyUnicode_IS_ASCII(unicode))
2579 return;
2580
2581 len = PyUnicode_GET_LENGTH(unicode);
2582 kind = PyUnicode_KIND(unicode);
2583 if (kind == PyUnicode_1BYTE_KIND) {
2584 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002585 max_char = ucs1lib_find_max_char(u, u + len);
2586 if (max_char >= 128)
2587 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002588 }
2589 else if (kind == PyUnicode_2BYTE_KIND) {
2590 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002591 max_char = ucs2lib_find_max_char(u, u + len);
2592 if (max_char >= 256)
2593 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002594 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002595 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002596 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002597 max_char = ucs4lib_find_max_char(u, u + len);
2598 if (max_char >= 0x10000)
2599 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002600 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002601 else
2602 Py_UNREACHABLE();
2603
Victor Stinner25a4b292011-10-06 12:31:55 +02002604 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002605 if (copy != NULL)
2606 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002607 Py_DECREF(unicode);
2608 *p_unicode = copy;
2609}
2610
Victor Stinner034f6cf2011-09-30 02:26:44 +02002611PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002612_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002613{
Victor Stinner87af4f22011-11-21 23:03:47 +01002614 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002615 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002616
Victor Stinner034f6cf2011-09-30 02:26:44 +02002617 if (!PyUnicode_Check(unicode)) {
2618 PyErr_BadInternalCall();
2619 return NULL;
2620 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002621 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002622 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002623
Victor Stinner87af4f22011-11-21 23:03:47 +01002624 length = PyUnicode_GET_LENGTH(unicode);
2625 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002626 if (!copy)
2627 return NULL;
2628 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2629
Christian Heimesf051e432016-09-13 20:22:02 +02002630 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002631 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002632 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002633 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002634}
2635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636
Victor Stinnerbc603d12011-10-02 01:00:40 +02002637/* Widen Unicode objects to larger buffers. Don't write terminating null
2638 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002640static void*
2641unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002643 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002644
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002645 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002646 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002647 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002648 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002649 if (!result)
2650 return PyErr_NoMemory();
2651 assert(skind == PyUnicode_1BYTE_KIND);
2652 _PyUnicode_CONVERT_BYTES(
2653 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002654 (const Py_UCS1 *)data,
2655 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002656 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002658 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002659 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002660 if (!result)
2661 return PyErr_NoMemory();
2662 if (skind == PyUnicode_2BYTE_KIND) {
2663 _PyUnicode_CONVERT_BYTES(
2664 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002665 (const Py_UCS2 *)data,
2666 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002667 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002669 else {
2670 assert(skind == PyUnicode_1BYTE_KIND);
2671 _PyUnicode_CONVERT_BYTES(
2672 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002673 (const Py_UCS1 *)data,
2674 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002675 result);
2676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002677 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002678 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002679 Py_UNREACHABLE();
2680 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682}
2683
2684static Py_UCS4*
2685as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2686 int copy_null)
2687{
2688 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002689 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 Py_ssize_t len, targetlen;
2691 if (PyUnicode_READY(string) == -1)
2692 return NULL;
2693 kind = PyUnicode_KIND(string);
2694 data = PyUnicode_DATA(string);
2695 len = PyUnicode_GET_LENGTH(string);
2696 targetlen = len;
2697 if (copy_null)
2698 targetlen++;
2699 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002700 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 if (!target) {
2702 PyErr_NoMemory();
2703 return NULL;
2704 }
2705 }
2706 else {
2707 if (targetsize < targetlen) {
2708 PyErr_Format(PyExc_SystemError,
2709 "string is longer than the buffer");
2710 if (copy_null && 0 < targetsize)
2711 target[0] = 0;
2712 return NULL;
2713 }
2714 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002715 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002716 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002717 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002719 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002720 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002721 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2722 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002723 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002724 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002725 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002726 else {
2727 Py_UNREACHABLE();
2728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 if (copy_null)
2730 target[len] = 0;
2731 return target;
2732}
2733
2734Py_UCS4*
2735PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2736 int copy_null)
2737{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002738 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 PyErr_BadInternalCall();
2740 return NULL;
2741 }
2742 return as_ucs4(string, target, targetsize, copy_null);
2743}
2744
2745Py_UCS4*
2746PyUnicode_AsUCS4Copy(PyObject *string)
2747{
2748 return as_ucs4(string, NULL, 0, 1);
2749}
2750
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2754#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002755
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002756static int
2757unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2758 Py_ssize_t width, Py_ssize_t precision)
2759{
2760 Py_ssize_t length, fill, arglen;
2761 Py_UCS4 maxchar;
2762
2763 if (PyUnicode_READY(str) == -1)
2764 return -1;
2765
2766 length = PyUnicode_GET_LENGTH(str);
2767 if ((precision == -1 || precision >= length)
2768 && width <= length)
2769 return _PyUnicodeWriter_WriteStr(writer, str);
2770
2771 if (precision != -1)
2772 length = Py_MIN(precision, length);
2773
2774 arglen = Py_MAX(length, width);
2775 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2776 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2777 else
2778 maxchar = writer->maxchar;
2779
2780 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2781 return -1;
2782
2783 if (width > length) {
2784 fill = width - length;
2785 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2786 return -1;
2787 writer->pos += fill;
2788 }
2789
2790 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2791 str, 0, length);
2792 writer->pos += length;
2793 return 0;
2794}
2795
2796static int
Victor Stinner998b8062018-09-12 00:23:25 +02002797unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002798 Py_ssize_t width, Py_ssize_t precision)
2799{
2800 /* UTF-8 */
2801 Py_ssize_t length;
2802 PyObject *unicode;
2803 int res;
2804
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002805 if (precision == -1) {
2806 length = strlen(str);
2807 }
2808 else {
2809 length = 0;
2810 while (length < precision && str[length]) {
2811 length++;
2812 }
2813 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002814 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2815 if (unicode == NULL)
2816 return -1;
2817
2818 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2819 Py_DECREF(unicode);
2820 return res;
2821}
2822
Victor Stinner96865452011-03-01 23:44:09 +00002823static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002824unicode_fromformat_arg(_PyUnicodeWriter *writer,
2825 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002826{
Victor Stinnere215d962012-10-06 23:03:36 +02002827 const char *p;
2828 Py_ssize_t len;
2829 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002830 Py_ssize_t width;
2831 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002832 int longflag;
2833 int longlongflag;
2834 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002835 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002836
2837 p = f;
2838 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002839 zeropad = 0;
2840 if (*f == '0') {
2841 zeropad = 1;
2842 f++;
2843 }
Victor Stinner96865452011-03-01 23:44:09 +00002844
2845 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002846 width = -1;
2847 if (Py_ISDIGIT((unsigned)*f)) {
2848 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002849 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002850 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002851 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002852 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002853 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002854 return NULL;
2855 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002856 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002857 f++;
2858 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002859 }
2860 precision = -1;
2861 if (*f == '.') {
2862 f++;
2863 if (Py_ISDIGIT((unsigned)*f)) {
2864 precision = (*f - '0');
2865 f++;
2866 while (Py_ISDIGIT((unsigned)*f)) {
2867 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2868 PyErr_SetString(PyExc_ValueError,
2869 "precision too big");
2870 return NULL;
2871 }
2872 precision = (precision * 10) + (*f - '0');
2873 f++;
2874 }
2875 }
Victor Stinner96865452011-03-01 23:44:09 +00002876 if (*f == '%') {
2877 /* "%.3%s" => f points to "3" */
2878 f--;
2879 }
2880 }
2881 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002882 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002883 f--;
2884 }
Victor Stinner96865452011-03-01 23:44:09 +00002885
2886 /* Handle %ld, %lu, %lld and %llu. */
2887 longflag = 0;
2888 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002889 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002890 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002891 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002892 longflag = 1;
2893 ++f;
2894 }
Victor Stinner96865452011-03-01 23:44:09 +00002895 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002896 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002897 longlongflag = 1;
2898 f += 2;
2899 }
Victor Stinner96865452011-03-01 23:44:09 +00002900 }
2901 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002902 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002903 size_tflag = 1;
2904 ++f;
2905 }
Victor Stinnere215d962012-10-06 23:03:36 +02002906
2907 if (f[1] == '\0')
2908 writer->overallocate = 0;
2909
2910 switch (*f) {
2911 case 'c':
2912 {
2913 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002914 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002915 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002916 "character argument not in range(0x110000)");
2917 return NULL;
2918 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002919 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002921 break;
2922 }
2923
2924 case 'i':
2925 case 'd':
2926 case 'u':
2927 case 'x':
2928 {
2929 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002930 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002931 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002932
2933 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002934 if (longflag) {
2935 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2936 }
2937 else if (longlongflag) {
2938 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2939 }
2940 else if (size_tflag) {
2941 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2942 }
2943 else {
2944 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2945 }
Victor Stinnere215d962012-10-06 23:03:36 +02002946 }
2947 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002948 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002949 }
2950 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002951 if (longflag) {
2952 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2953 }
2954 else if (longlongflag) {
2955 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2956 }
2957 else if (size_tflag) {
2958 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2959 }
2960 else {
2961 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2962 }
Victor Stinnere215d962012-10-06 23:03:36 +02002963 }
2964 assert(len >= 0);
2965
Victor Stinnere215d962012-10-06 23:03:36 +02002966 if (precision < len)
2967 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002968
2969 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002970 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2971 return NULL;
2972
Victor Stinnere215d962012-10-06 23:03:36 +02002973 if (width > precision) {
2974 Py_UCS4 fillchar;
2975 fill = width - precision;
2976 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002977 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2978 return NULL;
2979 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002980 }
Victor Stinner15a11362012-10-06 23:48:20 +02002981 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002982 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002983 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2984 return NULL;
2985 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002986 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002987
Victor Stinner4a587072013-11-19 12:54:53 +01002988 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2989 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002990 break;
2991 }
2992
2993 case 'p':
2994 {
2995 char number[MAX_LONG_LONG_CHARS];
2996
2997 len = sprintf(number, "%p", va_arg(*vargs, void*));
2998 assert(len >= 0);
2999
3000 /* %p is ill-defined: ensure leading 0x. */
3001 if (number[1] == 'X')
3002 number[1] = 'x';
3003 else if (number[1] != 'x') {
3004 memmove(number + 2, number,
3005 strlen(number) + 1);
3006 number[0] = '0';
3007 number[1] = 'x';
3008 len += 2;
3009 }
3010
Victor Stinner4a587072013-11-19 12:54:53 +01003011 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003012 return NULL;
3013 break;
3014 }
3015
3016 case 's':
3017 {
3018 /* UTF-8 */
3019 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02003020 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003021 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003022 break;
3023 }
3024
3025 case 'U':
3026 {
3027 PyObject *obj = va_arg(*vargs, PyObject *);
3028 assert(obj && _PyUnicode_CHECK(obj));
3029
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003030 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003031 return NULL;
3032 break;
3033 }
3034
3035 case 'V':
3036 {
3037 PyObject *obj = va_arg(*vargs, PyObject *);
3038 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02003039 if (obj) {
3040 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003041 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003042 return NULL;
3043 }
3044 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003045 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02003046 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003047 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003048 }
3049 break;
3050 }
3051
3052 case 'S':
3053 {
3054 PyObject *obj = va_arg(*vargs, PyObject *);
3055 PyObject *str;
3056 assert(obj);
3057 str = PyObject_Str(obj);
3058 if (!str)
3059 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003060 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003061 Py_DECREF(str);
3062 return NULL;
3063 }
3064 Py_DECREF(str);
3065 break;
3066 }
3067
3068 case 'R':
3069 {
3070 PyObject *obj = va_arg(*vargs, PyObject *);
3071 PyObject *repr;
3072 assert(obj);
3073 repr = PyObject_Repr(obj);
3074 if (!repr)
3075 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003076 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003077 Py_DECREF(repr);
3078 return NULL;
3079 }
3080 Py_DECREF(repr);
3081 break;
3082 }
3083
3084 case 'A':
3085 {
3086 PyObject *obj = va_arg(*vargs, PyObject *);
3087 PyObject *ascii;
3088 assert(obj);
3089 ascii = PyObject_ASCII(obj);
3090 if (!ascii)
3091 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003092 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003093 Py_DECREF(ascii);
3094 return NULL;
3095 }
3096 Py_DECREF(ascii);
3097 break;
3098 }
3099
3100 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003101 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003102 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003103 break;
3104
3105 default:
3106 /* if we stumble upon an unknown formatting code, copy the rest
3107 of the format string to the output string. (we cannot just
3108 skip the code, since there's no way to know what's in the
3109 argument list) */
3110 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003111 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003112 return NULL;
3113 f = p+len;
3114 return f;
3115 }
3116
3117 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003118 return f;
3119}
3120
Walter Dörwaldd2034312007-05-18 16:29:38 +00003121PyObject *
3122PyUnicode_FromFormatV(const char *format, va_list vargs)
3123{
Victor Stinnere215d962012-10-06 23:03:36 +02003124 va_list vargs2;
3125 const char *f;
3126 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003127
Victor Stinner8f674cc2013-04-17 23:02:17 +02003128 _PyUnicodeWriter_Init(&writer);
3129 writer.min_length = strlen(format) + 100;
3130 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003131
Benjamin Peterson0c212142016-09-20 20:39:33 -07003132 // Copy varags to be able to pass a reference to a subfunction.
3133 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003134
3135 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003136 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003137 f = unicode_fromformat_arg(&writer, f, &vargs2);
3138 if (f == NULL)
3139 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003141 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003142 const char *p;
3143 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003144
Victor Stinnere215d962012-10-06 23:03:36 +02003145 p = f;
3146 do
3147 {
3148 if ((unsigned char)*p > 127) {
3149 PyErr_Format(PyExc_ValueError,
3150 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3151 "string, got a non-ASCII byte: 0x%02x",
3152 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003153 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003154 }
3155 p++;
3156 }
3157 while (*p != '\0' && *p != '%');
3158 len = p - f;
3159
3160 if (*p == '\0')
3161 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003162
3163 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003164 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003165
3166 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003167 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003168 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003169 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003170 return _PyUnicodeWriter_Finish(&writer);
3171
3172 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003173 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003174 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003175 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003176}
3177
Walter Dörwaldd2034312007-05-18 16:29:38 +00003178PyObject *
3179PyUnicode_FromFormat(const char *format, ...)
3180{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003181 PyObject* ret;
3182 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003183
3184#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003185 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003186#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003187 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003188#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003189 ret = PyUnicode_FromFormatV(format, vargs);
3190 va_end(vargs);
3191 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003192}
3193
Serhiy Storchakac46db922018-10-23 22:58:24 +03003194static Py_ssize_t
3195unicode_get_widechar_size(PyObject *unicode)
3196{
3197 Py_ssize_t res;
3198
3199 assert(unicode != NULL);
3200 assert(_PyUnicode_CHECK(unicode));
3201
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003202#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003203 if (_PyUnicode_WSTR(unicode) != NULL) {
3204 return PyUnicode_WSTR_LENGTH(unicode);
3205 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003206#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003207 assert(PyUnicode_IS_READY(unicode));
3208
3209 res = _PyUnicode_LENGTH(unicode);
3210#if SIZEOF_WCHAR_T == 2
3211 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3212 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3213 const Py_UCS4 *end = s + res;
3214 for (; s < end; ++s) {
3215 if (*s > 0xFFFF) {
3216 ++res;
3217 }
3218 }
3219 }
3220#endif
3221 return res;
3222}
3223
3224static void
3225unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3226{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003227 assert(unicode != NULL);
3228 assert(_PyUnicode_CHECK(unicode));
3229
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003230#if USE_UNICODE_WCHAR_CACHE
3231 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003232 if (wstr != NULL) {
3233 memcpy(w, wstr, size * sizeof(wchar_t));
3234 return;
3235 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003236#else /* USE_UNICODE_WCHAR_CACHE */
3237 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3238 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3239 return;
3240 }
3241#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003242 assert(PyUnicode_IS_READY(unicode));
3243
3244 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3245 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3246 for (; size--; ++s, ++w) {
3247 *w = *s;
3248 }
3249 }
3250 else {
3251#if SIZEOF_WCHAR_T == 4
3252 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3253 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3254 for (; size--; ++s, ++w) {
3255 *w = *s;
3256 }
3257#else
3258 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3259 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3260 for (; size--; ++s, ++w) {
3261 Py_UCS4 ch = *s;
3262 if (ch > 0xFFFF) {
3263 assert(ch <= MAX_UNICODE);
3264 /* encode surrogate pair in this case */
3265 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3266 if (!size--)
3267 break;
3268 *w = Py_UNICODE_LOW_SURROGATE(ch);
3269 }
3270 else {
3271 *w = ch;
3272 }
3273 }
3274#endif
3275 }
3276}
3277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003278#ifdef HAVE_WCHAR_H
3279
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003280/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003281
Victor Stinnerd88d9832011-09-06 02:00:05 +02003282 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003283 character) required to convert the unicode object. Ignore size argument.
3284
Victor Stinnerd88d9832011-09-06 02:00:05 +02003285 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003286 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003287 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003288Py_ssize_t
3289PyUnicode_AsWideChar(PyObject *unicode,
3290 wchar_t *w,
3291 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003292{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003293 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003294
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003295 if (unicode == NULL) {
3296 PyErr_BadInternalCall();
3297 return -1;
3298 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003299 if (!PyUnicode_Check(unicode)) {
3300 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003302 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003303
3304 res = unicode_get_widechar_size(unicode);
3305 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003306 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003307 }
3308
3309 if (size > res) {
3310 size = res + 1;
3311 }
3312 else {
3313 res = size;
3314 }
3315 unicode_copy_as_widechar(unicode, w, size);
Jakub Kulík9032cf52021-04-30 15:21:42 +02003316
3317#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3318 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3319 non-Unicode locales and hence needs conversion first. */
3320 if (_Py_LocaleUsesNonUnicodeWchar()) {
3321 if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3322 return -1;
3323 }
3324 }
3325#endif
3326
Serhiy Storchakac46db922018-10-23 22:58:24 +03003327 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003328}
3329
Victor Stinner137c34c2010-09-29 10:25:54 +00003330wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003331PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003332 Py_ssize_t *size)
3333{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003334 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003335 Py_ssize_t buflen;
3336
3337 if (unicode == NULL) {
3338 PyErr_BadInternalCall();
3339 return NULL;
3340 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003341 if (!PyUnicode_Check(unicode)) {
3342 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003343 return NULL;
3344 }
3345
Serhiy Storchakac46db922018-10-23 22:58:24 +03003346 buflen = unicode_get_widechar_size(unicode);
3347 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003348 if (buffer == NULL) {
3349 PyErr_NoMemory();
3350 return NULL;
3351 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003352 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
Jakub Kulík9032cf52021-04-30 15:21:42 +02003353
3354#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3355 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3356 non-Unicode locales and hence needs conversion first. */
3357 if (_Py_LocaleUsesNonUnicodeWchar()) {
3358 if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3359 return NULL;
3360 }
3361 }
3362#endif
3363
Serhiy Storchakac46db922018-10-23 22:58:24 +03003364 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003365 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003366 }
3367 else if (wcslen(buffer) != (size_t)buflen) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01003368 PyMem_Free(buffer);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003369 PyErr_SetString(PyExc_ValueError,
3370 "embedded null character");
3371 return NULL;
3372 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003373 return buffer;
3374}
3375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003376#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003378int
3379_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3380{
3381 wchar_t **p = (wchar_t **)ptr;
3382 if (obj == NULL) {
3383#if !USE_UNICODE_WCHAR_CACHE
3384 PyMem_Free(*p);
3385#endif /* USE_UNICODE_WCHAR_CACHE */
3386 *p = NULL;
3387 return 1;
3388 }
3389 if (PyUnicode_Check(obj)) {
3390#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003391 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3392 if (*p == NULL) {
3393 return 0;
3394 }
3395 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003396#else /* USE_UNICODE_WCHAR_CACHE */
3397 *p = PyUnicode_AsWideCharString(obj, NULL);
3398 if (*p == NULL) {
3399 return 0;
3400 }
3401 return Py_CLEANUP_SUPPORTED;
3402#endif /* USE_UNICODE_WCHAR_CACHE */
3403 }
3404 PyErr_Format(PyExc_TypeError,
3405 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003406 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003407 return 0;
3408}
3409
3410int
3411_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3412{
3413 wchar_t **p = (wchar_t **)ptr;
3414 if (obj == NULL) {
3415#if !USE_UNICODE_WCHAR_CACHE
3416 PyMem_Free(*p);
3417#endif /* USE_UNICODE_WCHAR_CACHE */
3418 *p = NULL;
3419 return 1;
3420 }
3421 if (obj == Py_None) {
3422 *p = NULL;
3423 return 1;
3424 }
3425 if (PyUnicode_Check(obj)) {
3426#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003427 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3428 if (*p == NULL) {
3429 return 0;
3430 }
3431 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003432#else /* USE_UNICODE_WCHAR_CACHE */
3433 *p = PyUnicode_AsWideCharString(obj, NULL);
3434 if (*p == NULL) {
3435 return 0;
3436 }
3437 return Py_CLEANUP_SUPPORTED;
3438#endif /* USE_UNICODE_WCHAR_CACHE */
3439 }
3440 PyErr_Format(PyExc_TypeError,
3441 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003442 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003443 return 0;
3444}
3445
Alexander Belopolsky40018472011-02-26 01:02:56 +00003446PyObject *
3447PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003448{
Victor Stinner8faf8212011-12-08 22:14:11 +01003449 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003450 PyErr_SetString(PyExc_ValueError,
3451 "chr() arg not in range(0x110000)");
3452 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003453 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003454
Victor Stinner985a82a2014-01-03 12:53:47 +01003455 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003456}
3457
Alexander Belopolsky40018472011-02-26 01:02:56 +00003458PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003459PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003461 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003462 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003463 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003464 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003465 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003466 Py_INCREF(obj);
3467 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003468 }
3469 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 /* For a Unicode subtype that's not a Unicode object,
3471 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003472 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003473 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003474 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003475 "Can't convert '%.100s' object to str implicitly",
3476 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003477 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003478}
3479
Alexander Belopolsky40018472011-02-26 01:02:56 +00003480PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003481PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003482 const char *encoding,
3483 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003484{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003485 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003486 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003487
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003489 PyErr_BadInternalCall();
3490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003492
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003493 /* Decoding bytes objects is the most common case and should be fast */
3494 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003495 if (PyBytes_GET_SIZE(obj) == 0) {
3496 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3497 return NULL;
3498 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003499 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003500 }
3501 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003502 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3503 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003504 }
3505
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003506 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003507 PyErr_SetString(PyExc_TypeError,
3508 "decoding str is not supported");
3509 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003510 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003511
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003512 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3513 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3514 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003515 "decoding to str: need a bytes-like object, %.80s found",
3516 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003517 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003518 }
Tim Petersced69f82003-09-16 20:30:58 +00003519
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003520 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003521 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003522 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3523 return NULL;
3524 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003525 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003527
Serhiy Storchaka05997252013-01-26 12:14:02 +02003528 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003529 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003530 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531}
3532
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into `lower`;
   every run of other characters between two copied characters collapses
   into a single '_', and such runs at the very start or end are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters plus the terminating NUL). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    char *const out_end = &lower[lower_len - 1];
    /* Set when punctuation was seen since the last copied character. */
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        /* Emit the separator lazily so that leading and trailing
           punctuation never produces an underscore. */
        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3581
Alexander Belopolsky40018472011-02-26 01:02:56 +00003582PyObject *
3583PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003584 Py_ssize_t size,
3585 const char *encoding,
3586 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003587{
3588 PyObject *buffer = NULL, *unicode;
3589 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003590 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3591
Victor Stinner22eb6892019-06-26 00:51:05 +02003592 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3593 return NULL;
3594 }
3595
Victor Stinnered076ed2019-06-26 01:49:32 +02003596 if (size == 0) {
3597 _Py_RETURN_UNICODE_EMPTY();
3598 }
3599
Victor Stinner942889a2016-09-05 15:40:10 -07003600 if (encoding == NULL) {
3601 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3602 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003603
Fred Drakee4315f52000-05-09 19:53:39 +00003604 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003605 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3606 char *lower = buflower;
3607
3608 /* Fast paths */
3609 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3610 lower += 3;
3611 if (*lower == '_') {
3612 /* Match "utf8" and "utf_8" */
3613 lower++;
3614 }
3615
3616 if (lower[0] == '8' && lower[1] == 0) {
3617 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3618 }
3619 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3620 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3621 }
3622 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3623 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3624 }
3625 }
3626 else {
3627 if (strcmp(lower, "ascii") == 0
3628 || strcmp(lower, "us_ascii") == 0) {
3629 return PyUnicode_DecodeASCII(s, size, errors);
3630 }
Steve Dowercc16be82016-09-08 10:35:16 -07003631 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003632 else if (strcmp(lower, "mbcs") == 0) {
3633 return PyUnicode_DecodeMBCS(s, size, errors);
3634 }
3635 #endif
3636 else if (strcmp(lower, "latin1") == 0
3637 || strcmp(lower, "latin_1") == 0
3638 || strcmp(lower, "iso_8859_1") == 0
3639 || strcmp(lower, "iso8859_1") == 0) {
3640 return PyUnicode_DecodeLatin1(s, size, errors);
3641 }
3642 }
Victor Stinner37296e82010-06-10 13:36:23 +00003643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644
3645 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003646 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003647 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003648 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003649 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 if (buffer == NULL)
3651 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003652 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 if (unicode == NULL)
3654 goto onError;
3655 if (!PyUnicode_Check(unicode)) {
3656 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003657 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003658 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003659 encoding,
3660 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 Py_DECREF(unicode);
3662 goto onError;
3663 }
3664 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003665 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003666
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 Py_XDECREF(buffer);
3669 return NULL;
3670}
3671
Alexander Belopolsky40018472011-02-26 01:02:56 +00003672PyObject *
3673PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003674 const char *encoding,
3675 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003676{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003677 if (!PyUnicode_Check(unicode)) {
3678 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003679 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003680 }
3681
Serhiy Storchaka00939072016-10-27 21:05:49 +03003682 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3683 "PyUnicode_AsDecodedObject() is deprecated; "
3684 "use PyCodec_Decode() to decode from str", 1) < 0)
3685 return NULL;
3686
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003687 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003689
3690 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003691 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003692}
3693
Alexander Belopolsky40018472011-02-26 01:02:56 +00003694PyObject *
3695PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003696 const char *encoding,
3697 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003698{
3699 PyObject *v;
3700
3701 if (!PyUnicode_Check(unicode)) {
3702 PyErr_BadArgument();
3703 goto onError;
3704 }
3705
Serhiy Storchaka00939072016-10-27 21:05:49 +03003706 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3707 "PyUnicode_AsDecodedUnicode() is deprecated; "
3708 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3709 return NULL;
3710
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003711 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003713
3714 /* Decode via the codec registry */
3715 v = PyCodec_Decode(unicode, encoding, errors);
3716 if (v == NULL)
3717 goto onError;
3718 if (!PyUnicode_Check(v)) {
3719 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003720 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003721 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003722 encoding,
3723 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003724 Py_DECREF(v);
3725 goto onError;
3726 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003727 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003728
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003730 return NULL;
3731}
3732
Alexander Belopolsky40018472011-02-26 01:02:56 +00003733PyObject *
3734PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003735 Py_ssize_t size,
3736 const char *encoding,
3737 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738{
3739 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003740
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003741 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3745 Py_DECREF(unicode);
3746 return v;
3747}
3748
Alexander Belopolsky40018472011-02-26 01:02:56 +00003749PyObject *
3750PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003751 const char *encoding,
3752 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003753{
3754 PyObject *v;
3755
3756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 goto onError;
3759 }
3760
Serhiy Storchaka00939072016-10-27 21:05:49 +03003761 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3762 "PyUnicode_AsEncodedObject() is deprecated; "
3763 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3764 "or PyCodec_Encode() for generic encoding", 1) < 0)
3765 return NULL;
3766
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003767 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003768 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003769
3770 /* Encode via the codec registry */
3771 v = PyCodec_Encode(unicode, encoding, errors);
3772 if (v == NULL)
3773 goto onError;
3774 return v;
3775
Benjamin Peterson29060642009-01-31 22:14:21 +00003776 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003777 return NULL;
3778}
3779
Victor Stinner1b579672011-12-17 05:47:23 +01003780
Victor Stinner2cba6b82018-01-10 22:46:15 +01003781static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003782unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003783 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003784{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003785 Py_ssize_t wlen;
3786 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3787 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003788 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003789 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003790
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003791 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003792 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003793 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003794 return NULL;
3795 }
3796
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003797 char *str;
3798 size_t error_pos;
3799 const char *reason;
3800 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003801 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003802 PyMem_Free(wstr);
3803
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003804 if (res != 0) {
3805 if (res == -2) {
3806 PyObject *exc;
3807 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3808 "locale", unicode,
3809 (Py_ssize_t)error_pos,
3810 (Py_ssize_t)(error_pos+1),
3811 reason);
3812 if (exc != NULL) {
3813 PyCodec_StrictErrors(exc);
3814 Py_DECREF(exc);
3815 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003816 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003817 else if (res == -3) {
3818 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3819 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003820 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003821 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003822 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003823 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003824 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003825
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003826 PyObject *bytes = PyBytes_FromString(str);
3827 PyMem_RawFree(str);
3828 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003829}
3830
Victor Stinnerad158722010-10-27 00:25:46 +00003831PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003832PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3833{
Victor Stinner709d23d2019-05-02 14:56:30 -04003834 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3835 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003836}
3837
3838PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003839PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003840{
Victor Stinner81a7be32020-04-14 15:14:01 +02003841 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003842 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3843 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003844 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003845 fs_codec->error_handler,
3846 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003847 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003848#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003849 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003850 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003851 fs_codec->encoding,
3852 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003853 }
Victor Stinnerad158722010-10-27 00:25:46 +00003854#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003855 else {
3856 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3857 machinery is not ready and so cannot be used:
3858 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003859 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3860 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003861 assert(filesystem_errors != NULL);
3862 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3863 assert(errors != _Py_ERROR_UNKNOWN);
3864#ifdef _Py_FORCE_UTF8_FS_ENCODING
3865 return unicode_encode_utf8(unicode, errors, NULL);
3866#else
3867 return unicode_encode_locale(unicode, errors, 0);
3868#endif
3869 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003870}
3871
Alexander Belopolsky40018472011-02-26 01:02:56 +00003872PyObject *
3873PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003874 const char *encoding,
3875 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876{
3877 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003878 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003879
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880 if (!PyUnicode_Check(unicode)) {
3881 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003882 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 }
Fred Drakee4315f52000-05-09 19:53:39 +00003884
Victor Stinner22eb6892019-06-26 00:51:05 +02003885 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3886 return NULL;
3887 }
3888
Victor Stinner942889a2016-09-05 15:40:10 -07003889 if (encoding == NULL) {
3890 return _PyUnicode_AsUTF8String(unicode, errors);
3891 }
3892
Fred Drakee4315f52000-05-09 19:53:39 +00003893 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003894 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3895 char *lower = buflower;
3896
3897 /* Fast paths */
3898 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3899 lower += 3;
3900 if (*lower == '_') {
3901 /* Match "utf8" and "utf_8" */
3902 lower++;
3903 }
3904
3905 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003907 }
3908 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3909 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3910 }
3911 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3912 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3913 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003914 }
Victor Stinner942889a2016-09-05 15:40:10 -07003915 else {
3916 if (strcmp(lower, "ascii") == 0
3917 || strcmp(lower, "us_ascii") == 0) {
3918 return _PyUnicode_AsASCIIString(unicode, errors);
3919 }
Steve Dowercc16be82016-09-08 10:35:16 -07003920#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003921 else if (strcmp(lower, "mbcs") == 0) {
3922 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3923 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003924#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003925 else if (strcmp(lower, "latin1") == 0 ||
3926 strcmp(lower, "latin_1") == 0 ||
3927 strcmp(lower, "iso_8859_1") == 0 ||
3928 strcmp(lower, "iso8859_1") == 0) {
3929 return _PyUnicode_AsLatin1String(unicode, errors);
3930 }
3931 }
Victor Stinner37296e82010-06-10 13:36:23 +00003932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933
3934 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003935 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003937 return NULL;
3938
3939 /* The normal path */
3940 if (PyBytes_Check(v))
3941 return v;
3942
3943 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003944 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003945 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003946 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003947
3948 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003949 "encoder %s returned bytearray instead of bytes; "
3950 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003951 encoding);
3952 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003953 Py_DECREF(v);
3954 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003955 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003956
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003957 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3958 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003959 Py_DECREF(v);
3960 return b;
3961 }
3962
3963 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003964 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003965 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003966 encoding,
3967 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003968 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003969 return NULL;
3970}
3971
Alexander Belopolsky40018472011-02-26 01:02:56 +00003972PyObject *
3973PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003974 const char *encoding,
3975 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003976{
3977 PyObject *v;
3978
3979 if (!PyUnicode_Check(unicode)) {
3980 PyErr_BadArgument();
3981 goto onError;
3982 }
3983
Serhiy Storchaka00939072016-10-27 21:05:49 +03003984 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3985 "PyUnicode_AsEncodedUnicode() is deprecated; "
3986 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3987 return NULL;
3988
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003989 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003991
3992 /* Encode via the codec registry */
3993 v = PyCodec_Encode(unicode, encoding, errors);
3994 if (v == NULL)
3995 goto onError;
3996 if (!PyUnicode_Check(v)) {
3997 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003998 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003999 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02004000 encoding,
4001 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00004002 Py_DECREF(v);
4003 goto onError;
4004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 return v;
Tim Petersced69f82003-09-16 20:30:58 +00004006
Benjamin Peterson29060642009-01-31 22:14:21 +00004007 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 return NULL;
4009}
4010
Victor Stinner2cba6b82018-01-10 22:46:15 +01004011static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04004012unicode_decode_locale(const char *str, Py_ssize_t len,
4013 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004014{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004015 if (str[len] != '\0' || (size_t)len != strlen(str)) {
4016 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004017 return NULL;
4018 }
4019
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004020 wchar_t *wstr;
4021 size_t wlen;
4022 const char *reason;
4023 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04004024 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004025 if (res != 0) {
4026 if (res == -2) {
4027 PyObject *exc;
4028 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4029 "locale", str, len,
4030 (Py_ssize_t)wlen,
4031 (Py_ssize_t)(wlen + 1),
4032 reason);
4033 if (exc != NULL) {
4034 PyCodec_StrictErrors(exc);
4035 Py_DECREF(exc);
4036 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004037 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02004038 else if (res == -3) {
4039 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4040 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004041 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004042 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01004043 }
Victor Stinner2f197072011-12-17 07:08:30 +01004044 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01004045 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004046
4047 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4048 PyMem_RawFree(wstr);
4049 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004050}
4051
4052PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01004053PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4054 const char *errors)
4055{
Victor Stinner709d23d2019-05-02 14:56:30 -04004056 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4057 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01004058}
4059
4060PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01004061PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004062{
4063 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04004064 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4065 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004066}
4067
4068
4069PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00004070PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004071 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004072 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4073}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004074
Christian Heimes5894ba72007-11-04 11:43:14 +00004075PyObject*
4076PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4077{
Victor Stinner81a7be32020-04-14 15:14:01 +02004078 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004079 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4080 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004081 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004082 fs_codec->error_handler,
4083 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004084 NULL);
4085 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004086#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004087 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004088 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004089 fs_codec->encoding,
4090 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004091 }
Victor Stinnerad158722010-10-27 00:25:46 +00004092#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004093 else {
4094 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4095 machinery is not ready and so cannot be used:
4096 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004097 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4098 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004099 assert(filesystem_errors != NULL);
4100 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4101 assert(errors != _Py_ERROR_UNKNOWN);
4102#ifdef _Py_FORCE_UTF8_FS_ENCODING
4103 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4104#else
4105 return unicode_decode_locale(s, size, errors, 0);
4106#endif
4107 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004108}
4109
Martin v. Löwis011e8422009-05-05 04:43:17 +00004110
4111int
4112PyUnicode_FSConverter(PyObject* arg, void* addr)
4113{
Brett Cannonec6ce872016-09-06 15:50:29 -07004114 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004115 PyObject *output = NULL;
4116 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004117 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004118 if (arg == NULL) {
4119 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004120 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004121 return 1;
4122 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004123 path = PyOS_FSPath(arg);
4124 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004125 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004126 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004127 if (PyBytes_Check(path)) {
4128 output = path;
4129 }
4130 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4131 output = PyUnicode_EncodeFSDefault(path);
4132 Py_DECREF(path);
4133 if (!output) {
4134 return 0;
4135 }
4136 assert(PyBytes_Check(output));
4137 }
4138
Victor Stinner0ea2a462010-04-30 00:22:08 +00004139 size = PyBytes_GET_SIZE(output);
4140 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004141 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004142 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004143 Py_DECREF(output);
4144 return 0;
4145 }
4146 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004147 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004148}
4149
4150
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004151int
4152PyUnicode_FSDecoder(PyObject* arg, void* addr)
4153{
Brett Cannona5711202016-09-06 19:36:01 -07004154 int is_buffer = 0;
4155 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004156 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004157 if (arg == NULL) {
4158 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004159 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004160 return 1;
4161 }
Brett Cannona5711202016-09-06 19:36:01 -07004162
4163 is_buffer = PyObject_CheckBuffer(arg);
4164 if (!is_buffer) {
4165 path = PyOS_FSPath(arg);
4166 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004167 return 0;
4168 }
Brett Cannona5711202016-09-06 19:36:01 -07004169 }
4170 else {
4171 path = arg;
4172 Py_INCREF(arg);
4173 }
4174
4175 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004176 output = path;
4177 }
4178 else if (PyBytes_Check(path) || is_buffer) {
4179 PyObject *path_bytes = NULL;
4180
4181 if (!PyBytes_Check(path) &&
4182 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004183 "path should be string, bytes, or os.PathLike, not %.200s",
4184 Py_TYPE(arg)->tp_name)) {
4185 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004186 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004187 }
4188 path_bytes = PyBytes_FromObject(path);
4189 Py_DECREF(path);
4190 if (!path_bytes) {
4191 return 0;
4192 }
4193 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4194 PyBytes_GET_SIZE(path_bytes));
4195 Py_DECREF(path_bytes);
4196 if (!output) {
4197 return 0;
4198 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004199 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004200 else {
4201 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004202 "path should be string, bytes, or os.PathLike, not %.200s",
4203 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004204 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004205 return 0;
4206 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004207 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004208 Py_DECREF(output);
4209 return 0;
4210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004211 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004212 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004213 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004214 Py_DECREF(output);
4215 return 0;
4216 }
4217 *(PyObject**)addr = output;
4218 return Py_CLEANUP_SUPPORTED;
4219}
4220
4221
Inada Naoki02a4d572020-02-27 13:48:59 +09004222static int unicode_fill_utf8(PyObject *unicode);
4223
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004224const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004225PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004226{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004227 if (!PyUnicode_Check(unicode)) {
4228 PyErr_BadArgument();
4229 return NULL;
4230 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004231 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004232 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004234 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004235 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004236 return NULL;
4237 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 }
4239
4240 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004241 *psize = PyUnicode_UTF8_LENGTH(unicode);
4242 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004243}
4244
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004245const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004248 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4249}
4250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004251Py_UNICODE *
4252PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004254 if (!PyUnicode_Check(unicode)) {
4255 PyErr_BadArgument();
4256 return NULL;
4257 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004258 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4259 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004261 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004262 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263
Serhiy Storchakac46db922018-10-23 22:58:24 +03004264 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4265 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4266 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01004269 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
Serhiy Storchakac46db922018-10-23 22:58:24 +03004270 if (w == NULL) {
4271 PyErr_NoMemory();
4272 return NULL;
4273 }
4274 unicode_copy_as_widechar(unicode, w, wlen + 1);
4275 _PyUnicode_WSTR(unicode) = w;
4276 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4277 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004278 }
4279 }
4280 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004281 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004282 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004283}
4284
Inada Naoki2c4928d2020-06-17 20:09:44 +09004285/* Deprecated APIs */
4286
4287_Py_COMP_DIAG_PUSH
4288_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4289
Alexander Belopolsky40018472011-02-26 01:02:56 +00004290Py_UNICODE *
4291PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004293 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294}
4295
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004296const Py_UNICODE *
4297_PyUnicode_AsUnicode(PyObject *unicode)
4298{
4299 Py_ssize_t size;
4300 const Py_UNICODE *wstr;
4301
4302 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4303 if (wstr && wcslen(wstr) != (size_t)size) {
4304 PyErr_SetString(PyExc_ValueError, "embedded null character");
4305 return NULL;
4306 }
4307 return wstr;
4308}
4309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004310
Alexander Belopolsky40018472011-02-26 01:02:56 +00004311Py_ssize_t
4312PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313{
4314 if (!PyUnicode_Check(unicode)) {
4315 PyErr_BadArgument();
4316 goto onError;
4317 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004318 if (_PyUnicode_WSTR(unicode) == NULL) {
4319 if (PyUnicode_AsUnicode(unicode) == NULL)
4320 goto onError;
4321 }
4322 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323
Benjamin Peterson29060642009-01-31 22:14:21 +00004324 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 return -1;
4326}
4327
Inada Naoki2c4928d2020-06-17 20:09:44 +09004328_Py_COMP_DIAG_POP
4329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004330Py_ssize_t
4331PyUnicode_GetLength(PyObject *unicode)
4332{
Victor Stinner07621332012-06-16 04:53:46 +02004333 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004334 PyErr_BadArgument();
4335 return -1;
4336 }
Victor Stinner07621332012-06-16 04:53:46 +02004337 if (PyUnicode_READY(unicode) == -1)
4338 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 return PyUnicode_GET_LENGTH(unicode);
4340}
4341
4342Py_UCS4
4343PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4344{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004345 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004346 int kind;
4347
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004348 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004349 PyErr_BadArgument();
4350 return (Py_UCS4)-1;
4351 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004352 if (PyUnicode_READY(unicode) == -1) {
4353 return (Py_UCS4)-1;
4354 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004355 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004356 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004357 return (Py_UCS4)-1;
4358 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004359 data = PyUnicode_DATA(unicode);
4360 kind = PyUnicode_KIND(unicode);
4361 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004362}
4363
4364int
4365PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4366{
4367 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004368 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369 return -1;
4370 }
Victor Stinner488fa492011-12-12 00:01:39 +01004371 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004372 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004373 PyErr_SetString(PyExc_IndexError, "string index out of range");
4374 return -1;
4375 }
Victor Stinner488fa492011-12-12 00:01:39 +01004376 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004377 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004378 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4379 PyErr_SetString(PyExc_ValueError, "character out of range");
4380 return -1;
4381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004382 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4383 index, ch);
4384 return 0;
4385}
4386
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4392
Victor Stinner554f3f02010-06-16 23:33:54 +00004393/* create or adjust a UnicodeDecodeError */
4394static void
4395make_decode_exception(PyObject **exceptionObject,
4396 const char *encoding,
4397 const char *input, Py_ssize_t length,
4398 Py_ssize_t startpos, Py_ssize_t endpos,
4399 const char *reason)
4400{
4401 if (*exceptionObject == NULL) {
4402 *exceptionObject = PyUnicodeDecodeError_Create(
4403 encoding, input, length, startpos, endpos, reason);
4404 }
4405 else {
4406 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4407 goto onError;
4408 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4409 goto onError;
4410 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4411 goto onError;
4412 }
4413 return;
4414
4415onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004416 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004417}
4418
Steve Dowercc16be82016-09-08 10:35:16 -07004419#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004420static int
4421widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4422{
4423 if (newsize > *size) {
4424 wchar_t *newbuf = *buf;
4425 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4426 PyErr_NoMemory();
4427 return -1;
4428 }
4429 *buf = newbuf;
4430 }
4431 *size = newsize;
4432 return 0;
4433}
4434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435/* error handling callback helper:
4436 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004437 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 and adjust various state variables.
4439 return 0 on success, -1 on error
4440*/
4441
Alexander Belopolsky40018472011-02-26 01:02:56 +00004442static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443unicode_decode_call_errorhandler_wchar(
4444 const char *errors, PyObject **errorHandler,
4445 const char *encoding, const char *reason,
4446 const char **input, const char **inend, Py_ssize_t *startinpos,
4447 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004448 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004450 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451
4452 PyObject *restuple = NULL;
4453 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004454 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004455 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004456 Py_ssize_t requiredsize;
4457 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004458 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004459 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460
4461 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 *errorHandler = PyCodec_LookupError(errors);
4463 if (*errorHandler == NULL)
4464 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 }
4466
Victor Stinner554f3f02010-06-16 23:33:54 +00004467 make_decode_exception(exceptionObject,
4468 encoding,
4469 *input, *inend - *input,
4470 *startinpos, *endinpos,
4471 reason);
4472 if (*exceptionObject == NULL)
4473 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474
Petr Viktorinffd97532020-02-11 17:46:57 +01004475 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004479 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004482 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004484
4485 /* Copy back the bytes variables, which might have been modified by the
4486 callback */
4487 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4488 if (!inputobj)
4489 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004490 *input = PyBytes_AS_STRING(inputobj);
4491 insize = PyBytes_GET_SIZE(inputobj);
4492 *inend = *input + insize;
4493 /* we can DECREF safely, as the exception has another reference,
4494 so the object won't go away. */
4495 Py_DECREF(inputobj);
4496
4497 if (newpos<0)
4498 newpos = insize+newpos;
4499 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004500 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004501 goto onError;
4502 }
4503
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004504#if USE_UNICODE_WCHAR_CACHE
4505_Py_COMP_DIAG_PUSH
4506_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4507 repwlen = PyUnicode_GetSize(repunicode);
4508 if (repwlen < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004509 goto onError;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004510_Py_COMP_DIAG_POP
4511#else /* USE_UNICODE_WCHAR_CACHE */
4512 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4513 if (repwlen < 0)
4514 goto onError;
4515 repwlen--;
4516#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004517 /* need more space? (at least enough for what we
4518 have+the replacement+the rest of the string (starting
4519 at the new input position), so we won't have to check space
4520 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004521 requiredsize = *outpos;
4522 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4523 goto overflow;
4524 requiredsize += repwlen;
4525 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4526 goto overflow;
4527 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004528 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004530 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004531 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004532 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004534 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004535 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004536 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004537 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004538 *endinpos = newpos;
4539 *inptr = *input + newpos;
4540
4541 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004542 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004543 return 0;
4544
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004545 overflow:
4546 PyErr_SetString(PyExc_OverflowError,
4547 "decoded result is too long for a Python string");
4548
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004549 onError:
4550 Py_XDECREF(restuple);
4551 return -1;
4552}
Steve Dowercc16be82016-09-08 10:35:16 -07004553#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554
4555static int
4556unicode_decode_call_errorhandler_writer(
4557 const char *errors, PyObject **errorHandler,
4558 const char *encoding, const char *reason,
4559 const char **input, const char **inend, Py_ssize_t *startinpos,
4560 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4561 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4562{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004563 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004564
4565 PyObject *restuple = NULL;
4566 PyObject *repunicode = NULL;
4567 Py_ssize_t insize;
4568 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004569 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004570 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004571 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004572 int need_to_grow = 0;
4573 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004574
4575 if (*errorHandler == NULL) {
4576 *errorHandler = PyCodec_LookupError(errors);
4577 if (*errorHandler == NULL)
4578 goto onError;
4579 }
4580
4581 make_decode_exception(exceptionObject,
4582 encoding,
4583 *input, *inend - *input,
4584 *startinpos, *endinpos,
4585 reason);
4586 if (*exceptionObject == NULL)
4587 goto onError;
4588
Petr Viktorinffd97532020-02-11 17:46:57 +01004589 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004590 if (restuple == NULL)
4591 goto onError;
4592 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004593 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004594 goto onError;
4595 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004596 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004597 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004598
4599 /* Copy back the bytes variables, which might have been modified by the
4600 callback */
4601 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4602 if (!inputobj)
4603 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004604 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004605 *input = PyBytes_AS_STRING(inputobj);
4606 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004607 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004608 /* we can DECREF safely, as the exception has another reference,
4609 so the object won't go away. */
4610 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004611
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004614 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004615 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004617 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618
Victor Stinner170ca6f2013-04-18 00:25:28 +02004619 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004620 if (replen > 1) {
4621 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004622 need_to_grow = 1;
4623 }
4624 new_inptr = *input + newpos;
4625 if (*inend - new_inptr > remain) {
4626 /* We don't know the decoding algorithm here so we make the worst
4627 assumption that one byte decodes to one unicode character.
4628 If unfortunately one byte could decode to more unicode characters,
4629 the decoder may write out-of-bound then. Is it possible for the
4630 algorithms using this function? */
4631 writer->min_length += *inend - new_inptr - remain;
4632 need_to_grow = 1;
4633 }
4634 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004635 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004636 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004637 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4638 goto onError;
4639 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004640 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004641 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004644 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004647 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004648 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004652 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653}
4654
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655/* --- UTF-7 Codec -------------------------------------------------------- */
4656
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657/* See RFC2152 for details. We encode conservatively and decode liberally. */
4658
/* Three simple macros defining base-64.
 *
 * Note: each macro may evaluate its argument more than once, so arguments
 * must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* The table is only ever read (through ENCODE_DIRECT), so it is const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) keep their meaning next to `&&`. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735
Alexander Belopolsky40018472011-02-26 01:02:56 +00004736PyObject *
4737PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004738 Py_ssize_t size,
4739 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004741 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4742}
4743
Antoine Pitrou244651a2009-05-04 18:56:13 +00004744/* The decoder. The only state we preserve is our read position,
4745 * i.e. how many characters we have consumed. So if we end in the
4746 * middle of a shift sequence we have to back off the read position
4747 * and the output to the beginning of the sequence, otherwise we lose
4748 * all the shift state (seen bits, number of bits seen, high
4749 * surrogate). */
4750
Alexander Belopolsky40018472011-02-26 01:02:56 +00004751PyObject *
4752PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004753 Py_ssize_t size,
4754 const char *errors,
4755 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004756{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004758 Py_ssize_t startinpos;
4759 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004760 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004761 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004762 const char *errmsg = "";
4763 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004764 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 unsigned int base64bits = 0;
4766 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004767 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 PyObject *errorHandler = NULL;
4769 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004771 if (size == 0) {
4772 if (consumed)
4773 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004774 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004775 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004776
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004777 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004778 _PyUnicodeWriter_Init(&writer);
4779 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004780
4781 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004782 e = s + size;
4783
4784 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004785 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004787 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788
Antoine Pitrou244651a2009-05-04 18:56:13 +00004789 if (inShift) { /* in a base-64 section */
4790 if (IS_BASE64(ch)) { /* consume a base-64 character */
4791 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4792 base64bits += 6;
4793 s++;
4794 if (base64bits >= 16) {
4795 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004796 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 base64bits -= 16;
4798 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004799 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 if (surrogate) {
4801 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004802 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4803 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004804 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004805 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004806 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004807 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 }
4809 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004810 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004811 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004812 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 }
4814 }
Victor Stinner551ac952011-11-29 22:58:13 +01004815 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 /* first surrogate */
4817 surrogate = outCh;
4818 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004819 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004820 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004821 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 }
4823 }
4824 }
4825 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004826 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 if (base64bits > 0) { /* left-over bits */
4828 if (base64bits >= 6) {
4829 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004830 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004831 errmsg = "partial character in shift sequence";
4832 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004833 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004834 else {
4835 /* Some bits remain; they should be zero */
4836 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004837 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004838 errmsg = "non-zero padding bits in shift sequence";
4839 goto utf7Error;
4840 }
4841 }
4842 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004843 if (surrogate && DECODE_DIRECT(ch)) {
4844 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4845 goto onError;
4846 }
4847 surrogate = 0;
4848 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004849 /* '-' is absorbed; other terminating
4850 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004851 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004852 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004853 }
4854 }
4855 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004857 s++; /* consume '+' */
4858 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004859 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004860 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004861 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004862 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004863 else if (s < e && !IS_BASE64(*s)) {
4864 s++;
4865 errmsg = "ill-formed sequence";
4866 goto utf7Error;
4867 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004868 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004869 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004870 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004871 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004872 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004873 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004874 }
4875 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004876 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004877 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004878 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004879 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004880 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004881 else {
4882 startinpos = s-starts;
4883 s++;
4884 errmsg = "unexpected special character";
4885 goto utf7Error;
4886 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004887 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004888utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004890 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 errors, &errorHandler,
4892 "utf7", errmsg,
4893 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004894 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004895 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004896 }
4897
Antoine Pitrou244651a2009-05-04 18:56:13 +00004898 /* end of string */
4899
4900 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4901 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004902 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004903 if (surrogate ||
4904 (base64bits >= 6) ||
4905 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004906 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004907 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004908 errors, &errorHandler,
4909 "utf7", "unterminated shift sequence",
4910 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004911 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004912 goto onError;
4913 if (s < e)
4914 goto restart;
4915 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004916 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004917
4918 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004919 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004920 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004921 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004922 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004923 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004924 writer.kind, writer.data, shiftOutStart);
4925 Py_XDECREF(errorHandler);
4926 Py_XDECREF(exc);
4927 _PyUnicodeWriter_Dealloc(&writer);
4928 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004929 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004930 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004931 }
4932 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004933 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004934 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004935 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004936
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937 Py_XDECREF(errorHandler);
4938 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004939 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004940
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004942 Py_XDECREF(errorHandler);
4943 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004944 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004945 return NULL;
4946}
4947
4948
Alexander Belopolsky40018472011-02-26 01:02:56 +00004949PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004950_PyUnicode_EncodeUTF7(PyObject *str,
4951 int base64SetO,
4952 int base64WhiteSpace,
4953 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004954{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004955 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004956 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004957 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004958 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004959 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004960 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004961 unsigned int base64bits = 0;
4962 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004963 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004964 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004965
Benjamin Petersonbac79492012-01-14 13:34:47 -05004966 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004967 return NULL;
4968 kind = PyUnicode_KIND(str);
4969 data = PyUnicode_DATA(str);
4970 len = PyUnicode_GET_LENGTH(str);
4971
4972 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004974
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004975 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004976 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004977 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004978 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004979 if (v == NULL)
4980 return NULL;
4981
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004982 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004983 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004984 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004985
Antoine Pitrou244651a2009-05-04 18:56:13 +00004986 if (inShift) {
4987 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4988 /* shifting out */
4989 if (base64bits) { /* output remaining bits */
4990 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4991 base64buffer = 0;
4992 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004993 }
4994 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004995 /* Characters not in the BASE64 set implicitly unshift the sequence
4996 so no '-' is required, except if the character is itself a '-' */
4997 if (IS_BASE64(ch) || ch == '-') {
4998 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004999 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005000 *out++ = (char) ch;
5001 }
5002 else {
5003 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00005004 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005005 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005006 else { /* not in a shift sequence */
5007 if (ch == '+') {
5008 *out++ = '+';
5009 *out++ = '-';
5010 }
5011 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
5012 *out++ = (char) ch;
5013 }
5014 else {
5015 *out++ = '+';
5016 inShift = 1;
5017 goto encode_char;
5018 }
5019 }
5020 continue;
5021encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00005022 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005023 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01005024
Antoine Pitrou244651a2009-05-04 18:56:13 +00005025 /* code first surrogate */
5026 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01005027 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00005028 while (base64bits >= 6) {
5029 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
5030 base64bits -= 6;
5031 }
5032 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01005033 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00005034 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005035 base64bits += 16;
5036 base64buffer = (base64buffer << 16) | ch;
5037 while (base64bits >= 6) {
5038 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
5039 base64bits -= 6;
5040 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00005041 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005042 if (base64bits)
5043 *out++= TO_BASE64(base64buffer << (6-base64bits) );
5044 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005045 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005046 if (_PyBytes_Resize(&v, out - start) < 0)
5047 return NULL;
5048 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005049}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005050PyObject *
5051PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5052 Py_ssize_t size,
5053 int base64SetO,
5054 int base64WhiteSpace,
5055 const char *errors)
5056{
5057 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005058 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005059 if (tmp == NULL)
5060 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01005061 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005062 base64WhiteSpace, errors);
5063 Py_DECREF(tmp);
5064 return result;
5065}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005066
Antoine Pitrou244651a2009-05-04 18:56:13 +00005067#undef IS_BASE64
5068#undef FROM_BASE64
5069#undef TO_BASE64
5070#undef DECODE_DIRECT
5071#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005072
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073/* --- UTF-8 Codec -------------------------------------------------------- */
5074
Alexander Belopolsky40018472011-02-26 01:02:56 +00005075PyObject *
5076PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005077 Py_ssize_t size,
5078 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079{
Walter Dörwald69652032004-09-07 20:24:22 +00005080 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5081}
5082
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083#include "stringlib/asciilib.h"
5084#include "stringlib/codecs.h"
5085#include "stringlib/undef.h"
5086
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005087#include "stringlib/ucs1lib.h"
5088#include "stringlib/codecs.h"
5089#include "stringlib/undef.h"
5090
5091#include "stringlib/ucs2lib.h"
5092#include "stringlib/codecs.h"
5093#include "stringlib/undef.h"
5094
5095#include "stringlib/ucs4lib.h"
5096#include "stringlib/codecs.h"
5097#include "stringlib/undef.h"
5098
Ma Lina0c603c2020-10-18 22:48:38 +08005099/* Mask to quickly check whether a C 'size_t' contains a
Antoine Pitrouab868312009-01-10 15:40:25 +00005100 non-ASCII, UTF8-encoded char. */
Ma Lina0c603c2020-10-18 22:48:38 +08005101#if (SIZEOF_SIZE_T == 8)
5102# define ASCII_CHAR_MASK 0x8080808080808080ULL
5103#elif (SIZEOF_SIZE_T == 4)
5104# define ASCII_CHAR_MASK 0x80808080U
Antoine Pitrouab868312009-01-10 15:40:25 +00005105#else
Ma Lina0c603c2020-10-18 22:48:38 +08005106# error C 'size_t' size should be either 4 or 8!
Antoine Pitrouab868312009-01-10 15:40:25 +00005107#endif
5108
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005109static Py_ssize_t
5110ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005111{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005112 const char *p = start;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113
Ma Lina0c603c2020-10-18 22:48:38 +08005114#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
Jessica Clarkedec07572021-03-31 11:12:39 +01005115 assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
5116 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005117 /* Fast path, see in STRINGLIB(utf8_decode) for
5118 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005119 /* Help allocation */
5120 const char *_p = p;
5121 Py_UCS1 * q = dest;
Jessica Clarkedec07572021-03-31 11:12:39 +01005122 while (_p + SIZEOF_SIZE_T <= end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005123 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005126 *((size_t *)q) = value;
5127 _p += SIZEOF_SIZE_T;
5128 q += SIZEOF_SIZE_T;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005129 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 p = _p;
5131 while (p < end) {
5132 if ((unsigned char)*p & 0x80)
5133 break;
5134 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005136 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005138#endif
5139 while (p < end) {
5140 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5141 for an explanation. */
Jessica Clarkedec07572021-03-31 11:12:39 +01005142 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005143 /* Help allocation */
5144 const char *_p = p;
Jessica Clarkedec07572021-03-31 11:12:39 +01005145 while (_p + SIZEOF_SIZE_T <= end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005146 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 if (value & ASCII_CHAR_MASK)
5148 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005149 _p += SIZEOF_SIZE_T;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005150 }
5151 p = _p;
5152 if (_p == end)
5153 break;
5154 }
5155 if ((unsigned char)*p & 0x80)
5156 break;
5157 ++p;
5158 }
5159 memcpy(dest, start, p - start);
5160 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161}
Antoine Pitrouab868312009-01-10 15:40:25 +00005162
Victor Stinner709d23d2019-05-02 14:56:30 -04005163static PyObject *
5164unicode_decode_utf8(const char *s, Py_ssize_t size,
5165 _Py_error_handler error_handler, const char *errors,
5166 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005167{
Victor Stinner785938e2011-12-11 20:09:03 +01005168 if (size == 0) {
5169 if (consumed)
5170 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005171 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005172 }
5173
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005174 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5175 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005176 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005177 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005178 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005179 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005180 }
5181
Inada Naoki770847a2019-06-24 12:30:24 +09005182 const char *starts = s;
5183 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005184
Inada Naoki770847a2019-06-24 12:30:24 +09005185 // fast path: try ASCII string.
5186 PyObject *u = PyUnicode_New(size, 127);
5187 if (u == NULL) {
5188 return NULL;
5189 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005190 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005191 if (s == end) {
5192 return u;
5193 }
5194
5195 // Use _PyUnicodeWriter after fast path is failed.
5196 _PyUnicodeWriter writer;
5197 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5198 writer.pos = s - starts;
5199
5200 Py_ssize_t startinpos, endinpos;
5201 const char *errmsg = "";
5202 PyObject *error_handler_obj = NULL;
5203 PyObject *exc = NULL;
5204
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005205 while (s < end) {
5206 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005207 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005208
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005209 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005210 if (PyUnicode_IS_ASCII(writer.buffer))
5211 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005212 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005213 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005214 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005215 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005216 } else {
5217 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005218 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005219 }
5220
5221 switch (ch) {
5222 case 0:
5223 if (s == end || consumed)
5224 goto End;
5225 errmsg = "unexpected end of data";
5226 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005227 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005228 break;
5229 case 1:
5230 errmsg = "invalid start byte";
5231 startinpos = s - starts;
5232 endinpos = startinpos + 1;
5233 break;
5234 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005235 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5236 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5237 {
5238 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005239 goto End;
5240 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005241 /* fall through */
5242 case 3:
5243 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005244 errmsg = "invalid continuation byte";
5245 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005246 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005247 break;
5248 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005249 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005250 goto onError;
5251 continue;
5252 }
5253
Victor Stinner1d65d912015-10-05 13:43:50 +02005254 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005255 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005256
5257 switch (error_handler) {
5258 case _Py_ERROR_IGNORE:
5259 s += (endinpos - startinpos);
5260 break;
5261
5262 case _Py_ERROR_REPLACE:
5263 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5264 goto onError;
5265 s += (endinpos - startinpos);
5266 break;
5267
5268 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005269 {
5270 Py_ssize_t i;
5271
Victor Stinner1d65d912015-10-05 13:43:50 +02005272 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5273 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005274 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005275 ch = (Py_UCS4)(unsigned char)(starts[i]);
5276 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5277 ch + 0xdc00);
5278 writer.pos++;
5279 }
5280 s += (endinpos - startinpos);
5281 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005282 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005283
5284 default:
5285 if (unicode_decode_call_errorhandler_writer(
5286 errors, &error_handler_obj,
5287 "utf-8", errmsg,
5288 &starts, &end, &startinpos, &endinpos, &exc, &s,
5289 &writer))
5290 goto onError;
5291 }
Victor Stinner785938e2011-12-11 20:09:03 +01005292 }
5293
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005294End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005295 if (consumed)
5296 *consumed = s - starts;
5297
Victor Stinner1d65d912015-10-05 13:43:50 +02005298 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005299 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005300 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005301
5302onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005303 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005304 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005305 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005306 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005307}
5308
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005309
Victor Stinner709d23d2019-05-02 14:56:30 -04005310PyObject *
5311PyUnicode_DecodeUTF8Stateful(const char *s,
5312 Py_ssize_t size,
5313 const char *errors,
5314 Py_ssize_t *consumed)
5315{
5316 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5317}
5318
5319
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005320/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5321 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005322
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005323 On success, write a pointer to a newly allocated wide character string into
5324 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5325 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005326
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005327 On memory allocation failure, return -1.
5328
5329 On decoding error (if surrogateescape is zero), return -2. If wlen is
5330 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5331 is not NULL, write the decoding error message into *reason. */
5332int
5333_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005334 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005335{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005336 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005337 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005338 wchar_t *unicode;
5339 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005340
Victor Stinner3d4226a2018-08-29 22:21:32 +02005341 int surrogateescape = 0;
5342 int surrogatepass = 0;
5343 switch (errors)
5344 {
5345 case _Py_ERROR_STRICT:
5346 break;
5347 case _Py_ERROR_SURROGATEESCAPE:
5348 surrogateescape = 1;
5349 break;
5350 case _Py_ERROR_SURROGATEPASS:
5351 surrogatepass = 1;
5352 break;
5353 default:
5354 return -3;
5355 }
5356
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005357 /* Note: size will always be longer than the resulting Unicode
5358 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005359 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005360 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005361 }
5362
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005363 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005364 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005365 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005366 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005367
5368 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005369 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005370 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005371 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005372 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005373#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005374 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005375#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005376 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005377#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005378 if (ch > 0xFF) {
5379#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005380 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005381#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005382 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005383 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005384 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5385 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5386#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005387 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005388 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005389 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005390 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005391 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005392
5393 if (surrogateescape) {
5394 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5395 }
5396 else {
5397 /* Is it a valid three-byte code? */
5398 if (surrogatepass
5399 && (e - s) >= 3
5400 && (s[0] & 0xf0) == 0xe0
5401 && (s[1] & 0xc0) == 0x80
5402 && (s[2] & 0xc0) == 0x80)
5403 {
5404 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5405 s += 3;
5406 unicode[outpos++] = ch;
5407 }
5408 else {
5409 PyMem_RawFree(unicode );
5410 if (reason != NULL) {
5411 switch (ch) {
5412 case 0:
5413 *reason = "unexpected end of data";
5414 break;
5415 case 1:
5416 *reason = "invalid start byte";
5417 break;
5418 /* 2, 3, 4 */
5419 default:
5420 *reason = "invalid continuation byte";
5421 break;
5422 }
5423 }
5424 if (wlen != NULL) {
5425 *wlen = s - orig_s;
5426 }
5427 return -2;
5428 }
5429 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005430 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005431 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005432 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005433 if (wlen) {
5434 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005435 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005436 *wstr = unicode;
5437 return 0;
5438}
5439
Victor Stinner5f9cf232019-03-19 01:46:25 +01005440
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005441wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005442_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5443 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005444{
5445 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005446 int res = _Py_DecodeUTF8Ex(arg, arglen,
5447 &wstr, wlen,
5448 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005449 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005450 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5451 assert(res != -3);
5452 if (wlen) {
5453 *wlen = (size_t)res;
5454 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005455 return NULL;
5456 }
5457 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005458}
5459
Antoine Pitrouab868312009-01-10 15:40:25 +00005460
/* UTF-8 encoder supporting the strict, surrogateescape and surrogatepass
   error handlers.

   On success, return 0 and write the newly allocated character string into
   *str (free the memory with PyMem_RawFree() if raw_malloc is non-zero,
   with PyMem_Free() otherwise).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1.

   If the error handler is not supported, return -3. */
5471int
5472_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005473 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005474{
5475 const Py_ssize_t max_char_size = 4;
5476 Py_ssize_t len = wcslen(text);
5477
5478 assert(len >= 0);
5479
Victor Stinner3d4226a2018-08-29 22:21:32 +02005480 int surrogateescape = 0;
5481 int surrogatepass = 0;
5482 switch (errors)
5483 {
5484 case _Py_ERROR_STRICT:
5485 break;
5486 case _Py_ERROR_SURROGATEESCAPE:
5487 surrogateescape = 1;
5488 break;
5489 case _Py_ERROR_SURROGATEPASS:
5490 surrogatepass = 1;
5491 break;
5492 default:
5493 return -3;
5494 }
5495
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005496 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5497 return -1;
5498 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005499 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005500 if (raw_malloc) {
5501 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005502 }
5503 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005504 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005505 }
5506 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005507 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005508 }
5509
5510 char *p = bytes;
5511 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005512 for (i = 0; i < len; ) {
5513 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005514 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005515 i++;
5516#if Py_UNICODE_SIZE == 2
5517 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5518 && i < len
5519 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5520 {
5521 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5522 i++;
5523 }
5524#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005525
5526 if (ch < 0x80) {
5527 /* Encode ASCII */
5528 *p++ = (char) ch;
5529
5530 }
5531 else if (ch < 0x0800) {
5532 /* Encode Latin-1 */
5533 *p++ = (char)(0xc0 | (ch >> 6));
5534 *p++ = (char)(0x80 | (ch & 0x3f));
5535 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005536 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005537 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005538 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005539 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005540 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005541 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005542 if (reason != NULL) {
5543 *reason = "encoding error";
5544 }
5545 if (raw_malloc) {
5546 PyMem_RawFree(bytes);
5547 }
5548 else {
5549 PyMem_Free(bytes);
5550 }
5551 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005552 }
5553 *p++ = (char)(ch & 0xff);
5554 }
5555 else if (ch < 0x10000) {
5556 *p++ = (char)(0xe0 | (ch >> 12));
5557 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5558 *p++ = (char)(0x80 | (ch & 0x3f));
5559 }
5560 else { /* ch >= 0x10000 */
5561 assert(ch <= MAX_UNICODE);
5562 /* Encode UCS4 Unicode ordinals */
5563 *p++ = (char)(0xf0 | (ch >> 18));
5564 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5565 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5566 *p++ = (char)(0x80 | (ch & 0x3f));
5567 }
5568 }
5569 *p++ = '\0';
5570
5571 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005572 char *bytes2;
5573 if (raw_malloc) {
5574 bytes2 = PyMem_RawRealloc(bytes, final_size);
5575 }
5576 else {
5577 bytes2 = PyMem_Realloc(bytes, final_size);
5578 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005579 if (bytes2 == NULL) {
5580 if (error_pos != NULL) {
5581 *error_pos = (size_t)-1;
5582 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005583 if (raw_malloc) {
5584 PyMem_RawFree(bytes);
5585 }
5586 else {
5587 PyMem_Free(bytes);
5588 }
5589 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005590 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005591 *str = bytes2;
5592 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005593}
5594
5595
/* Primary internal function which creates UTF-8 encoded bytes objects.

   Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space as needed at the end.  Else allocate
   the maximum that could possibly be needed (4 result bytes per Unicode
   character), and return the excess memory at the end.
*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005603static PyObject *
5604unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5605 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005607 if (!PyUnicode_Check(unicode)) {
5608 PyErr_BadArgument();
5609 return NULL;
5610 }
5611
5612 if (PyUnicode_READY(unicode) == -1)
5613 return NULL;
5614
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005615 if (PyUnicode_UTF8(unicode))
5616 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5617 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005618
Inada Naoki02a4d572020-02-27 13:48:59 +09005619 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005620 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005621 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5622
5623 _PyBytesWriter writer;
5624 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005625
Benjamin Petersonead6b532011-12-20 17:23:42 -06005626 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005627 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005628 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005629 case PyUnicode_1BYTE_KIND:
5630 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5631 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005632 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5633 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005634 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005635 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5636 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005637 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005638 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5639 break;
Tim Peters602f7402002-04-27 18:03:26 +00005640 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005641
5642 if (end == NULL) {
5643 _PyBytesWriter_Dealloc(&writer);
5644 return NULL;
5645 }
5646 return _PyBytesWriter_Finish(&writer, end);
5647}
5648
5649static int
5650unicode_fill_utf8(PyObject *unicode)
5651{
5652 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5653 assert(!PyUnicode_IS_ASCII(unicode));
5654
5655 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005656 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005657 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5658
5659 _PyBytesWriter writer;
5660 char *end;
5661
5662 switch (kind) {
5663 default:
5664 Py_UNREACHABLE();
5665 case PyUnicode_1BYTE_KIND:
5666 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5667 _Py_ERROR_STRICT, NULL);
5668 break;
5669 case PyUnicode_2BYTE_KIND:
5670 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5671 _Py_ERROR_STRICT, NULL);
5672 break;
5673 case PyUnicode_4BYTE_KIND:
5674 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5675 _Py_ERROR_STRICT, NULL);
5676 break;
5677 }
5678 if (end == NULL) {
5679 _PyBytesWriter_Dealloc(&writer);
5680 return -1;
5681 }
5682
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005683 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005684 PyBytes_AS_STRING(writer.buffer);
5685 Py_ssize_t len = end - start;
5686
Victor Stinner32bd68c2020-12-01 10:37:39 +01005687 char *cache = PyObject_Malloc(len + 1);
Inada Naoki02a4d572020-02-27 13:48:59 +09005688 if (cache == NULL) {
5689 _PyBytesWriter_Dealloc(&writer);
5690 PyErr_NoMemory();
5691 return -1;
5692 }
5693 _PyUnicode_UTF8(unicode) = cache;
5694 _PyUnicode_UTF8_LENGTH(unicode) = len;
5695 memcpy(cache, start, len);
5696 cache[len] = '\0';
5697 _PyBytesWriter_Dealloc(&writer);
5698 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699}
5700
Alexander Belopolsky40018472011-02-26 01:02:56 +00005701PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005702_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5703{
5704 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5705}
5706
5707
5708PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5710 Py_ssize_t size,
5711 const char *errors)
5712{
5713 PyObject *v, *unicode;
5714
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005715 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005716 if (unicode == NULL)
5717 return NULL;
5718 v = _PyUnicode_AsUTF8String(unicode, errors);
5719 Py_DECREF(unicode);
5720 return v;
5721}
5722
5723PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005724PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005726 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727}
5728
Walter Dörwald41980ca2007-08-16 21:55:45 +00005729/* --- UTF-32 Codec ------------------------------------------------------- */
5730
5731PyObject *
5732PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 Py_ssize_t size,
5734 const char *errors,
5735 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005736{
5737 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5738}
5739
5740PyObject *
5741PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 Py_ssize_t size,
5743 const char *errors,
5744 int *byteorder,
5745 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005746{
5747 const char *starts = s;
5748 Py_ssize_t startinpos;
5749 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005750 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005751 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005752 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005754 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005755 PyObject *errorHandler = NULL;
5756 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005757
Andy Lestere6be9b52020-02-11 20:28:35 -06005758 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005759 e = q + size;
5760
5761 if (byteorder)
5762 bo = *byteorder;
5763
5764 /* Check for BOM marks (U+FEFF) in the input and adjust current
5765 byte order setting accordingly. In native mode, the leading BOM
5766 mark is skipped, in all other modes, it is copied to the output
5767 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005768 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005769 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005770 if (bom == 0x0000FEFF) {
5771 bo = -1;
5772 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005774 else if (bom == 0xFFFE0000) {
5775 bo = 1;
5776 q += 4;
5777 }
5778 if (byteorder)
5779 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005780 }
5781
Victor Stinnere64322e2012-10-30 23:12:47 +01005782 if (q == e) {
5783 if (consumed)
5784 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005785 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005786 }
5787
Victor Stinnere64322e2012-10-30 23:12:47 +01005788#ifdef WORDS_BIGENDIAN
5789 le = bo < 0;
5790#else
5791 le = bo <= 0;
5792#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005793 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005794
Victor Stinner8f674cc2013-04-17 23:02:17 +02005795 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005796 writer.min_length = (e - q + 3) / 4;
5797 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005798 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005799
Victor Stinnere64322e2012-10-30 23:12:47 +01005800 while (1) {
5801 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005803
Victor Stinnere64322e2012-10-30 23:12:47 +01005804 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005805 enum PyUnicode_Kind kind = writer.kind;
5806 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005807 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005808 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005809 if (le) {
5810 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005811 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005812 if (ch > maxch)
5813 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005814 if (kind != PyUnicode_1BYTE_KIND &&
5815 Py_UNICODE_IS_SURROGATE(ch))
5816 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005817 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005818 q += 4;
5819 } while (q <= last);
5820 }
5821 else {
5822 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005823 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005824 if (ch > maxch)
5825 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005826 if (kind != PyUnicode_1BYTE_KIND &&
5827 Py_UNICODE_IS_SURROGATE(ch))
5828 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005829 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005830 q += 4;
5831 } while (q <= last);
5832 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005833 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005834 }
5835
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005837 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005838 startinpos = ((const char *)q) - starts;
5839 endinpos = startinpos + 4;
5840 }
5841 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005842 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005844 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005846 startinpos = ((const char *)q) - starts;
5847 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005849 else {
5850 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005851 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005852 goto onError;
5853 q += 4;
5854 continue;
5855 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005856 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005857 startinpos = ((const char *)q) - starts;
5858 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005860
5861 /* The remaining input chars are ignored if the callback
5862 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005863 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005865 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005867 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005869 }
5870
Walter Dörwald41980ca2007-08-16 21:55:45 +00005871 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005873
Walter Dörwald41980ca2007-08-16 21:55:45 +00005874 Py_XDECREF(errorHandler);
5875 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005876 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005879 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005880 Py_XDECREF(errorHandler);
5881 Py_XDECREF(exc);
5882 return NULL;
5883}
5884
5885PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005886_PyUnicode_EncodeUTF32(PyObject *str,
5887 const char *errors,
5888 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005889{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005890 enum PyUnicode_Kind kind;
5891 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005893 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005894 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005895#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005896 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005897#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005898 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005899#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005900 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005901 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005902 PyObject *errorHandler = NULL;
5903 PyObject *exc = NULL;
5904 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005905
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005906 if (!PyUnicode_Check(str)) {
5907 PyErr_BadArgument();
5908 return NULL;
5909 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005910 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 return NULL;
5912 kind = PyUnicode_KIND(str);
5913 data = PyUnicode_DATA(str);
5914 len = PyUnicode_GET_LENGTH(str);
5915
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005916 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005917 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005918 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005919 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005920 if (v == NULL)
5921 return NULL;
5922
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005923 /* output buffer is 4-bytes aligned */
5924 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005925 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005926 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005927 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005929 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005930
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005931 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005932 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005933 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005934 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005935 else
5936 encoding = "utf-32";
5937
5938 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005939 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5940 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005941 }
5942
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005943 pos = 0;
5944 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005945 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005946
5947 if (kind == PyUnicode_2BYTE_KIND) {
5948 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5949 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005950 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005951 else {
5952 assert(kind == PyUnicode_4BYTE_KIND);
5953 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5954 &out, native_ordering);
5955 }
5956 if (pos == len)
5957 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005958
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005959 rep = unicode_encode_call_errorhandler(
5960 errors, &errorHandler,
5961 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005962 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005963 if (!rep)
5964 goto error;
5965
5966 if (PyBytes_Check(rep)) {
5967 repsize = PyBytes_GET_SIZE(rep);
5968 if (repsize & 3) {
5969 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005970 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005971 "surrogates not allowed");
5972 goto error;
5973 }
5974 moreunits = repsize / 4;
5975 }
5976 else {
5977 assert(PyUnicode_Check(rep));
5978 if (PyUnicode_READY(rep) < 0)
5979 goto error;
5980 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5981 if (!PyUnicode_IS_ASCII(rep)) {
5982 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005983 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005984 "surrogates not allowed");
5985 goto error;
5986 }
5987 }
5988
5989 /* four bytes are reserved for each surrogate */
5990 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005991 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005992 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005993 /* integer overflow */
5994 PyErr_NoMemory();
5995 goto error;
5996 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005997 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005998 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005999 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006000 }
6001
6002 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006003 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006004 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006005 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006006 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006007 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6008 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006009 }
6010
6011 Py_CLEAR(rep);
6012 }
6013
6014 /* Cut back to size actually needed. This is necessary for, for example,
6015 encoding of a string containing isolated surrogates and the 'ignore'
6016 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006017 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006018 if (nsize != PyBytes_GET_SIZE(v))
6019 _PyBytes_Resize(&v, nsize);
6020 Py_XDECREF(errorHandler);
6021 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006022 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006023 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006024 error:
6025 Py_XDECREF(rep);
6026 Py_XDECREF(errorHandler);
6027 Py_XDECREF(exc);
6028 Py_XDECREF(v);
6029 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00006030}
6031
Alexander Belopolsky40018472011-02-26 01:02:56 +00006032PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6034 Py_ssize_t size,
6035 const char *errors,
6036 int byteorder)
6037{
6038 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006039 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040 if (tmp == NULL)
6041 return NULL;
6042 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6043 Py_DECREF(tmp);
6044 return result;
6045}
6046
6047PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006048PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00006049{
Victor Stinnerb960b342011-11-20 19:12:52 +01006050 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00006051}
6052
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053/* --- UTF-16 Codec ------------------------------------------------------- */
6054
Tim Peters772747b2001-08-09 22:21:55 +00006055PyObject *
6056PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 Py_ssize_t size,
6058 const char *errors,
6059 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060{
Walter Dörwald69652032004-09-07 20:24:22 +00006061 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6062}
6063
6064PyObject *
6065PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 Py_ssize_t size,
6067 const char *errors,
6068 int *byteorder,
6069 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00006070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006072 Py_ssize_t startinpos;
6073 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006074 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006075 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006076 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006077 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006078 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 PyObject *errorHandler = NULL;
6080 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006081 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082
Andy Lestere6be9b52020-02-11 20:28:35 -06006083 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006084 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085
6086 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006087 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006089 /* Check for BOM marks (U+FEFF) in the input and adjust current
6090 byte order setting accordingly. In native mode, the leading BOM
6091 mark is skipped, in all other modes, it is copied to the output
6092 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006093 if (bo == 0 && size >= 2) {
6094 const Py_UCS4 bom = (q[1] << 8) | q[0];
6095 if (bom == 0xFEFF) {
6096 q += 2;
6097 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006099 else if (bom == 0xFFFE) {
6100 q += 2;
6101 bo = 1;
6102 }
6103 if (byteorder)
6104 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106
Antoine Pitrou63065d72012-05-15 23:48:04 +02006107 if (q == e) {
6108 if (consumed)
6109 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006110 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006111 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006112
Christian Heimes743e0cd2012-10-17 23:52:17 +02006113#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006114 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006115 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006116#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006117 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006118 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006119#endif
Tim Peters772747b2001-08-09 22:21:55 +00006120
Antoine Pitrou63065d72012-05-15 23:48:04 +02006121 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006122 character count normally. Error handler will take care of
6123 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006124 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006125 writer.min_length = (e - q + 1) / 2;
6126 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006127 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006128
Antoine Pitrou63065d72012-05-15 23:48:04 +02006129 while (1) {
6130 Py_UCS4 ch = 0;
6131 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006132 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006133 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006134 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006135 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006136 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006137 native_ordering);
6138 else
6139 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006140 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006141 native_ordering);
6142 } else if (kind == PyUnicode_2BYTE_KIND) {
6143 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006144 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006145 native_ordering);
6146 } else {
6147 assert(kind == PyUnicode_4BYTE_KIND);
6148 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006149 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006150 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006151 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006152 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153
Antoine Pitrou63065d72012-05-15 23:48:04 +02006154 switch (ch)
6155 {
6156 case 0:
6157 /* remaining byte at the end? (size should be even) */
6158 if (q == e || consumed)
6159 goto End;
6160 errmsg = "truncated data";
6161 startinpos = ((const char *)q) - starts;
6162 endinpos = ((const char *)e) - starts;
6163 break;
6164 /* The remaining input chars are ignored if the callback
6165 chooses to skip the input */
6166 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006167 q -= 2;
6168 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006169 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006170 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006171 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006172 endinpos = ((const char *)e) - starts;
6173 break;
6174 case 2:
6175 errmsg = "illegal encoding";
6176 startinpos = ((const char *)q) - 2 - starts;
6177 endinpos = startinpos + 2;
6178 break;
6179 case 3:
6180 errmsg = "illegal UTF-16 surrogate";
6181 startinpos = ((const char *)q) - 4 - starts;
6182 endinpos = startinpos + 2;
6183 break;
6184 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006185 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006186 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 continue;
6188 }
6189
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006190 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006191 errors,
6192 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006193 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006194 &starts,
6195 (const char **)&e,
6196 &startinpos,
6197 &endinpos,
6198 &exc,
6199 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006200 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 }
6203
Antoine Pitrou63065d72012-05-15 23:48:04 +02006204End:
Walter Dörwald69652032004-09-07 20:24:22 +00006205 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 Py_XDECREF(errorHandler);
6209 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006210 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006213 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 Py_XDECREF(errorHandler);
6215 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 return NULL;
6217}
6218
Tim Peters772747b2001-08-09 22:21:55 +00006219PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006220_PyUnicode_EncodeUTF16(PyObject *str,
6221 const char *errors,
6222 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006224 enum PyUnicode_Kind kind;
6225 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006227 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006228 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006229 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006230#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006231 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006232#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006233 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006234#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006235 const char *encoding;
6236 Py_ssize_t nsize, pos;
6237 PyObject *errorHandler = NULL;
6238 PyObject *exc = NULL;
6239 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006240
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006241 if (!PyUnicode_Check(str)) {
6242 PyErr_BadArgument();
6243 return NULL;
6244 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006245 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006246 return NULL;
6247 kind = PyUnicode_KIND(str);
6248 data = PyUnicode_DATA(str);
6249 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006250
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006251 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006252 if (kind == PyUnicode_4BYTE_KIND) {
6253 const Py_UCS4 *in = (const Py_UCS4 *)data;
6254 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006255 while (in < end) {
6256 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006257 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006258 }
6259 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006260 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006261 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006263 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006264 nsize = len + pairs + (byteorder == 0);
6265 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006266 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006270 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006271 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006272 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006273 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006274 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006275 }
6276 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006277 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006278 }
Tim Peters772747b2001-08-09 22:21:55 +00006279
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006280 if (kind == PyUnicode_1BYTE_KIND) {
6281 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6282 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006283 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006284
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006285 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006286 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006287 }
6288 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006289 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006290 }
6291 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006292 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006293 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006294
6295 pos = 0;
6296 while (pos < len) {
6297 Py_ssize_t repsize, moreunits;
6298
6299 if (kind == PyUnicode_2BYTE_KIND) {
6300 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6301 &out, native_ordering);
6302 }
6303 else {
6304 assert(kind == PyUnicode_4BYTE_KIND);
6305 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6306 &out, native_ordering);
6307 }
6308 if (pos == len)
6309 break;
6310
6311 rep = unicode_encode_call_errorhandler(
6312 errors, &errorHandler,
6313 encoding, "surrogates not allowed",
6314 str, &exc, pos, pos + 1, &pos);
6315 if (!rep)
6316 goto error;
6317
6318 if (PyBytes_Check(rep)) {
6319 repsize = PyBytes_GET_SIZE(rep);
6320 if (repsize & 1) {
6321 raise_encode_exception(&exc, encoding,
6322 str, pos - 1, pos,
6323 "surrogates not allowed");
6324 goto error;
6325 }
6326 moreunits = repsize / 2;
6327 }
6328 else {
6329 assert(PyUnicode_Check(rep));
6330 if (PyUnicode_READY(rep) < 0)
6331 goto error;
6332 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6333 if (!PyUnicode_IS_ASCII(rep)) {
6334 raise_encode_exception(&exc, encoding,
6335 str, pos - 1, pos,
6336 "surrogates not allowed");
6337 goto error;
6338 }
6339 }
6340
6341 /* two bytes are reserved for each surrogate */
6342 if (moreunits > 1) {
6343 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006344 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006345 /* integer overflow */
6346 PyErr_NoMemory();
6347 goto error;
6348 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006349 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006350 goto error;
6351 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6352 }
6353
6354 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006355 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006356 out += moreunits;
6357 } else /* rep is unicode */ {
6358 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6359 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6360 &out, native_ordering);
6361 }
6362
6363 Py_CLEAR(rep);
6364 }
6365
6366 /* Cut back to size actually needed. This is necessary for, for example,
6367 encoding of a string containing isolated surrogates and the 'ignore' handler
6368 is used. */
6369 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6370 if (nsize != PyBytes_GET_SIZE(v))
6371 _PyBytes_Resize(&v, nsize);
6372 Py_XDECREF(errorHandler);
6373 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006374 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006375 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006376 error:
6377 Py_XDECREF(rep);
6378 Py_XDECREF(errorHandler);
6379 Py_XDECREF(exc);
6380 Py_XDECREF(v);
6381 return NULL;
6382#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383}
6384
Alexander Belopolsky40018472011-02-26 01:02:56 +00006385PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006386PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6387 Py_ssize_t size,
6388 const char *errors,
6389 int byteorder)
6390{
6391 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006392 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006393 if (tmp == NULL)
6394 return NULL;
6395 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6396 Py_DECREF(tmp);
6397 return result;
6398}
6399
6400PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006401PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404}
6405
6406/* --- Unicode Escape Codec ----------------------------------------------- */
6407
/* Cached pointer to the unicodedata name-lookup C API.  It stays NULL until
   a \N{...} escape is decoded, at which point the capsule is imported and
   the result is kept here for later lookups. */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006409
/* Decode `size` bytes at `s` using the "unicodeescape" codec and return a
   new str object, or NULL with an exception set.

   errors: name of the decoding error handler, passed through to
       unicode_decode_call_errorhandler_writer().
   first_invalid_escape: output parameter, must not be NULL.  It is reset to
       NULL on entry and, when an unrecognized backslash escape is met, set
       to point at the character following the backslash of the FIRST such
       escape.  Unrecognized escapes are not errors here: the backslash and
       the character are copied to the output unchanged. */
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Store an ASCII character directly into the writer buffer; the caller
   guarantees (and the asserts check) that there is room for it. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

/* Store any character: written in place when it fits the writer's current
   maxchar, otherwise routed through _PyUnicodeWriter_WriteCharInline(),
   jumping to onError if that fails. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* s was already advanced past the backslash, hence the -1 */
        startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto error;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* \x escapes */
        /* backslash followed by a newline produces no output */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes */
        /* one to three octal digits; the first one is already in c */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Shared by \x, \u and \U: accumulate exactly `count` hex
               digits into ch.  The loop stops early at end of input or at
               the first non-hex character, leaving count non-zero. */
            for (ch = 0; count && s < end; ++s, --count) {
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    break;
                }
            }
            /* too few digits: report with the "truncated ..." message that
               was selected by the escape type above */
            if (count) {
                goto error;
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_capi == NULL) {
                /* load the unicode data module */
                ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_capi == NULL) {
                    /* not routed through the error handler: this replaces
                       whatever exception the import left behind */
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                    );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                namelen = s - start;
                /* require a non-empty name and an actual closing brace */
                if (namelen && s < end) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    /* getcode takes the length as int, so longer names are
                       treated as unknown */
                    if (namelen <= INT_MAX &&
                        ucnhash_capi->getcode(start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            /* unknown escape: emit the backslash and the character as-is */
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      error:
        /* Common error tail: the offending span is [startinpos, endinpos).
           min_length is raised so that the writer keeps room for the rest
           of the input after the handler's replacement (checked by the
           assert below). */
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6634
Eric V. Smith42454af2016-10-31 09:22:08 -04006635PyObject *
6636PyUnicode_DecodeUnicodeEscape(const char *s,
6637 Py_ssize_t size,
6638 const char *errors)
6639{
6640 const char *first_invalid_escape;
6641 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6642 &first_invalid_escape);
6643 if (result == NULL)
6644 return NULL;
6645 if (first_invalid_escape != NULL) {
6646 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6647 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006648 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006649 Py_DECREF(result);
6650 return NULL;
6651 }
6652 }
6653 return result;
6654}
6655
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006656/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006659PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006661 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006662 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006664 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006665 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006666 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667
Ezio Melottie7f90372012-10-05 03:33:31 +03006668 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006669 escape.
6670
Ezio Melottie7f90372012-10-05 03:33:31 +03006671 For UCS1 strings it's '\xxx', 4 bytes per source character.
6672 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6673 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006674 */
6675
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006676 if (!PyUnicode_Check(unicode)) {
6677 PyErr_BadArgument();
6678 return NULL;
6679 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006680 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006681 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006682 }
Victor Stinner358af132015-10-12 22:36:57 +02006683
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006684 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006685 if (len == 0) {
6686 return PyBytes_FromStringAndSize(NULL, 0);
6687 }
6688
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006689 kind = PyUnicode_KIND(unicode);
6690 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006691 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6692 bytes, and 1 byte characters 4. */
6693 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006694 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006695 return PyErr_NoMemory();
6696 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006697 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006698 if (repr == NULL) {
6699 return NULL;
6700 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006701
Victor Stinner62ec3312016-09-06 17:04:34 -07006702 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006703 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006704 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006705
Victor Stinner62ec3312016-09-06 17:04:34 -07006706 /* U+0000-U+00ff range */
6707 if (ch < 0x100) {
6708 if (ch >= ' ' && ch < 127) {
6709 if (ch != '\\') {
6710 /* Copy printable US ASCII as-is */
6711 *p++ = (char) ch;
6712 }
6713 /* Escape backslashes */
6714 else {
6715 *p++ = '\\';
6716 *p++ = '\\';
6717 }
6718 }
Victor Stinner358af132015-10-12 22:36:57 +02006719
Victor Stinner62ec3312016-09-06 17:04:34 -07006720 /* Map special whitespace to '\t', \n', '\r' */
6721 else if (ch == '\t') {
6722 *p++ = '\\';
6723 *p++ = 't';
6724 }
6725 else if (ch == '\n') {
6726 *p++ = '\\';
6727 *p++ = 'n';
6728 }
6729 else if (ch == '\r') {
6730 *p++ = '\\';
6731 *p++ = 'r';
6732 }
6733
6734 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6735 else {
6736 *p++ = '\\';
6737 *p++ = 'x';
6738 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6739 *p++ = Py_hexdigits[ch & 0x000F];
6740 }
Tim Petersced69f82003-09-16 20:30:58 +00006741 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006742 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006743 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 *p++ = '\\';
6745 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006746 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6747 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6748 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6749 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006751 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6752 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006753
Victor Stinner62ec3312016-09-06 17:04:34 -07006754 /* Make sure that the first two digits are zero */
6755 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006756 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006757 *p++ = 'U';
6758 *p++ = '0';
6759 *p++ = '0';
6760 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6761 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6762 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6763 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6764 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6765 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768
Victor Stinner62ec3312016-09-06 17:04:34 -07006769 assert(p - PyBytes_AS_STRING(repr) > 0);
6770 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6771 return NULL;
6772 }
6773 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774}
6775
Alexander Belopolsky40018472011-02-26 01:02:56 +00006776PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006777PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6778 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006780 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006781 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006782 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006784 }
6785
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006786 result = PyUnicode_AsUnicodeEscapeString(tmp);
6787 Py_DECREF(tmp);
6788 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
6791/* --- Raw Unicode Escape Codec ------------------------------------------- */
6792
Alexander Belopolsky40018472011-02-26 01:02:56 +00006793PyObject *
6794PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006795 Py_ssize_t size,
6796 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006798 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006799 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006801 PyObject *errorHandler = NULL;
6802 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006803
Victor Stinner62ec3312016-09-06 17:04:34 -07006804 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006805 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006806 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006807
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 /* Escaped strings will always be longer than the resulting
6809 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810 length after conversion to the true value. (But decoding error
6811 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006812 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006813 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006814 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6815 goto onError;
6816 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006817
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 end = s + size;
6819 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006820 unsigned char c = (unsigned char) *s++;
6821 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006822 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006823 Py_ssize_t startinpos;
6824 Py_ssize_t endinpos;
6825 const char *message;
6826
6827#define WRITE_CHAR(ch) \
6828 do { \
6829 if (ch <= writer.maxchar) { \
6830 assert(writer.pos < writer.size); \
6831 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6832 } \
6833 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6834 goto onError; \
6835 } \
6836 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006839 if (c != '\\' || s >= end) {
6840 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006842 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006843
Victor Stinner62ec3312016-09-06 17:04:34 -07006844 c = (unsigned char) *s++;
6845 if (c == 'u') {
6846 count = 4;
6847 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006849 else if (c == 'U') {
6850 count = 8;
6851 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006852 }
6853 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006854 assert(writer.pos < writer.size);
6855 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6856 WRITE_CHAR(c);
6857 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006858 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006859 startinpos = s - starts - 2;
6860
6861 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6862 for (ch = 0; count && s < end; ++s, --count) {
6863 c = (unsigned char)*s;
6864 ch <<= 4;
6865 if (c >= '0' && c <= '9') {
6866 ch += c - '0';
6867 }
6868 else if (c >= 'a' && c <= 'f') {
6869 ch += c - ('a' - 10);
6870 }
6871 else if (c >= 'A' && c <= 'F') {
6872 ch += c - ('A' - 10);
6873 }
6874 else {
6875 break;
6876 }
6877 }
6878 if (!count) {
6879 if (ch <= MAX_UNICODE) {
6880 WRITE_CHAR(ch);
6881 continue;
6882 }
6883 message = "\\Uxxxxxxxx out of range";
6884 }
6885
6886 endinpos = s-starts;
6887 writer.min_length = end - s + writer.pos;
6888 if (unicode_decode_call_errorhandler_writer(
6889 errors, &errorHandler,
6890 "rawunicodeescape", message,
6891 &starts, &end, &startinpos, &endinpos, &exc, &s,
6892 &writer)) {
6893 goto onError;
6894 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006895 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006896
6897#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899 Py_XDECREF(errorHandler);
6900 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006901 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006902
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006904 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 Py_XDECREF(errorHandler);
6906 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006908
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909}
6910
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006911
Alexander Belopolsky40018472011-02-26 01:02:56 +00006912PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006913PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914{
Victor Stinner62ec3312016-09-06 17:04:34 -07006915 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006917 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006918 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006919 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006920 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006922 if (!PyUnicode_Check(unicode)) {
6923 PyErr_BadArgument();
6924 return NULL;
6925 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006926 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006927 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006928 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006929 kind = PyUnicode_KIND(unicode);
6930 data = PyUnicode_DATA(unicode);
6931 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006932 if (kind == PyUnicode_1BYTE_KIND) {
6933 return PyBytes_FromStringAndSize(data, len);
6934 }
Victor Stinner0e368262011-11-10 20:12:49 +01006935
Victor Stinner62ec3312016-09-06 17:04:34 -07006936 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6937 bytes, and 1 byte characters 4. */
6938 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006939
Victor Stinner62ec3312016-09-06 17:04:34 -07006940 if (len > PY_SSIZE_T_MAX / expandsize) {
6941 return PyErr_NoMemory();
6942 }
6943 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6944 if (repr == NULL) {
6945 return NULL;
6946 }
6947 if (len == 0) {
6948 return repr;
6949 }
6950
6951 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006952 for (pos = 0; pos < len; pos++) {
6953 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006954
Victor Stinner62ec3312016-09-06 17:04:34 -07006955 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6956 if (ch < 0x100) {
6957 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006958 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006959 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006960 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 *p++ = '\\';
6962 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006963 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6964 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6965 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6966 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006968 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6969 else {
6970 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6971 *p++ = '\\';
6972 *p++ = 'U';
6973 *p++ = '0';
6974 *p++ = '0';
6975 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6976 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6977 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6978 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6979 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6980 *p++ = Py_hexdigits[ch & 15];
6981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006983
Victor Stinner62ec3312016-09-06 17:04:34 -07006984 assert(p > PyBytes_AS_STRING(repr));
6985 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6986 return NULL;
6987 }
6988 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989}
6990
Alexander Belopolsky40018472011-02-26 01:02:56 +00006991PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006992PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6993 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006995 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006996 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006997 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006998 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006999 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
7000 Py_DECREF(tmp);
7001 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002}
7003
7004/* --- Latin-1 Codec ------------------------------------------------------ */
7005
Alexander Belopolsky40018472011-02-26 01:02:56 +00007006PyObject *
7007PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007008 Py_ssize_t size,
7009 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06007012 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013}
7014
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007015/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007016static void
7017make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007018 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007019 PyObject *unicode,
7020 Py_ssize_t startpos, Py_ssize_t endpos,
7021 const char *reason)
7022{
7023 if (*exceptionObject == NULL) {
7024 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007025 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01007026 encoding, unicode, startpos, endpos, reason);
7027 }
7028 else {
7029 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7030 goto onError;
7031 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7032 goto onError;
7033 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7034 goto onError;
7035 return;
7036 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02007037 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01007038 }
7039}
7040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007041/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007042static void
7043raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007044 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007045 PyObject *unicode,
7046 Py_ssize_t startpos, Py_ssize_t endpos,
7047 const char *reason)
7048{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007049 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007050 encoding, unicode, startpos, endpos, reason);
7051 if (*exceptionObject != NULL)
7052 PyCodec_StrictErrors(*exceptionObject);
7053}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054
7055/* error handling callback helper:
7056 build arguments, call the callback and check the arguments,
7057 put the result into newpos and return the replacement string, which
7058 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007059static PyObject *
7060unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007061 PyObject **errorHandler,
7062 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007063 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007064 Py_ssize_t startpos, Py_ssize_t endpos,
7065 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007066{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007067 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007068 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007069 PyObject *restuple;
7070 PyObject *resunicode;
7071
7072 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007073 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007074 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007076 }
7077
Benjamin Petersonbac79492012-01-14 13:34:47 -05007078 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007079 return NULL;
7080 len = PyUnicode_GET_LENGTH(unicode);
7081
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007082 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007083 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007084 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007086
Petr Viktorinffd97532020-02-11 17:46:57 +01007087 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007088 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007090 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007091 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 Py_DECREF(restuple);
7093 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007094 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007095 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 &resunicode, newpos)) {
7097 Py_DECREF(restuple);
7098 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007099 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007100 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7101 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7102 Py_DECREF(restuple);
7103 return NULL;
7104 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007105 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007106 *newpos = len + *newpos;
7107 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007108 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 Py_DECREF(restuple);
7110 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007111 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007112 Py_INCREF(resunicode);
7113 Py_DECREF(restuple);
7114 return resunicode;
7115}
7116
Alexander Belopolsky40018472011-02-26 01:02:56 +00007117static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007118unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007119 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007120 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007121{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007122 /* input state */
7123 Py_ssize_t pos=0, size;
7124 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007125 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007126 /* pointer into the output */
7127 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007128 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7129 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007130 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007131 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007132 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007133 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007134 /* output object */
7135 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007136
Benjamin Petersonbac79492012-01-14 13:34:47 -05007137 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007138 return NULL;
7139 size = PyUnicode_GET_LENGTH(unicode);
7140 kind = PyUnicode_KIND(unicode);
7141 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007142 /* allocate enough for a simple encoding without
7143 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007144 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007145 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007146
7147 _PyBytesWriter_Init(&writer);
7148 str = _PyBytesWriter_Alloc(&writer, size);
7149 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007150 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007151
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007152 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007153 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007154
Benjamin Peterson29060642009-01-31 22:14:21 +00007155 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007156 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007157 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007158 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007159 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007160 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007162 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007163 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007164 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007165 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007167
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007168 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007170
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007171 /* Only overallocate the buffer if it's not the last write */
7172 writer.overallocate = (collend < size);
7173
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007175 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007176 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007177
7178 switch (error_handler) {
7179 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007180 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007182
7183 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007184 memset(str, '?', collend - collstart);
7185 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007186 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007187 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007188 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 break;
Victor Stinner50149202015-09-22 00:26:54 +02007190
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007191 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007192 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007193 writer.min_size -= (collend - collstart);
7194 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007195 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007196 if (str == NULL)
7197 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007198 pos = collend;
7199 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007200
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007201 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007202 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007203 writer.min_size -= (collend - collstart);
7204 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007205 unicode, collstart, collend);
7206 if (str == NULL)
7207 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007208 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 break;
Victor Stinner50149202015-09-22 00:26:54 +02007210
Victor Stinnerc3713e92015-09-29 12:32:13 +02007211 case _Py_ERROR_SURROGATEESCAPE:
7212 for (i = collstart; i < collend; ++i) {
7213 ch = PyUnicode_READ(kind, data, i);
7214 if (ch < 0xdc80 || 0xdcff < ch) {
7215 /* Not a UTF-8b surrogate */
7216 break;
7217 }
7218 *str++ = (char)(ch - 0xdc00);
7219 ++pos;
7220 }
7221 if (i >= collend)
7222 break;
7223 collstart = pos;
7224 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007225 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007226
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007228 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7229 encoding, reason, unicode, &exc,
7230 collstart, collend, &newpos);
7231 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007233
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007234 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007235 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007236
Victor Stinner6bd525b2015-10-09 13:10:05 +02007237 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007238 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007239 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007240 PyBytes_AS_STRING(rep),
7241 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007242 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007243 else {
7244 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007245
Victor Stinner6bd525b2015-10-09 13:10:05 +02007246 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007248
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007249 if (limit == 256 ?
7250 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7251 !PyUnicode_IS_ASCII(rep))
7252 {
7253 /* Not all characters are smaller than limit */
7254 raise_encode_exception(&exc, encoding, unicode,
7255 collstart, collend, reason);
7256 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007258 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7259 str = _PyBytesWriter_WriteBytes(&writer, str,
7260 PyUnicode_DATA(rep),
7261 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007263 if (str == NULL)
7264 goto onError;
7265
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007266 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007267 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007268 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007269
7270 /* If overallocation was disabled, ensure that it was the last
7271 write. Otherwise, we missed an optimization */
7272 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007273 }
7274 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007275
Victor Stinner50149202015-09-22 00:26:54 +02007276 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007277 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007278 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007279
7280 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007281 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007282 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007283 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007284 Py_XDECREF(exc);
7285 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007286}
7287
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007288/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007289PyObject *
7290PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007291 Py_ssize_t size,
7292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007294 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007295 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007296 if (unicode == NULL)
7297 return NULL;
7298 result = unicode_encode_ucs1(unicode, errors, 256);
7299 Py_DECREF(unicode);
7300 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301}
7302
Alexander Belopolsky40018472011-02-26 01:02:56 +00007303PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007304_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305{
7306 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 PyErr_BadArgument();
7308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007310 if (PyUnicode_READY(unicode) == -1)
7311 return NULL;
7312 /* Fast path: if it is a one-byte string, construct
7313 bytes object directly. */
7314 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7315 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7316 PyUnicode_GET_LENGTH(unicode));
7317 /* Non-Latin-1 characters present. Defer to above function to
7318 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007319 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007320}
7321
7322PyObject*
7323PyUnicode_AsLatin1String(PyObject *unicode)
7324{
7325 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326}
7327
7328/* --- 7-bit ASCII Codec -------------------------------------------------- */
7329
Alexander Belopolsky40018472011-02-26 01:02:56 +00007330PyObject *
7331PyUnicode_DecodeASCII(const char *s,
7332 Py_ssize_t size,
7333 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007335 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007336 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007337 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007338 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007339 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007340
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007342 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007343
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007345 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007346 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007348
Inada Naoki770847a2019-06-24 12:30:24 +09007349 // Shortcut for simple case
7350 PyObject *u = PyUnicode_New(size, 127);
7351 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007352 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007353 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007354 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007355 if (outpos == size) {
7356 return u;
7357 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007358
Inada Naoki770847a2019-06-24 12:30:24 +09007359 _PyUnicodeWriter writer;
7360 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007361 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007362
Inada Naoki770847a2019-06-24 12:30:24 +09007363 s += outpos;
7364 int kind = writer.kind;
7365 void *data = writer.data;
7366 Py_ssize_t startinpos, endinpos;
7367
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007368 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007369 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007371 PyUnicode_WRITE(kind, data, writer.pos, c);
7372 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007374 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007376
7377 /* byte outsize range 0x00..0x7f: call the error handler */
7378
7379 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007380 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007381
7382 switch (error_handler)
7383 {
7384 case _Py_ERROR_REPLACE:
7385 case _Py_ERROR_SURROGATEESCAPE:
7386 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007387 but we may switch to UCS2 at the first write */
7388 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7389 goto onError;
7390 kind = writer.kind;
7391 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007392
7393 if (error_handler == _Py_ERROR_REPLACE)
7394 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7395 else
7396 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7397 writer.pos++;
7398 ++s;
7399 break;
7400
7401 case _Py_ERROR_IGNORE:
7402 ++s;
7403 break;
7404
7405 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 startinpos = s-starts;
7407 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007408 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007409 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 "ascii", "ordinal not in range(128)",
7411 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007412 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007414 kind = writer.kind;
7415 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007418 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007419 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007420 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007421
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007423 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007424 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007425 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 return NULL;
7427}
7428
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007429/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007430PyObject *
7431PyUnicode_EncodeASCII(const Py_UNICODE *p,
7432 Py_ssize_t size,
7433 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007435 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007436 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007437 if (unicode == NULL)
7438 return NULL;
7439 result = unicode_encode_ucs1(unicode, errors, 128);
7440 Py_DECREF(unicode);
7441 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442}
7443
Alexander Belopolsky40018472011-02-26 01:02:56 +00007444PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007445_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446{
7447 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 PyErr_BadArgument();
7449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007451 if (PyUnicode_READY(unicode) == -1)
7452 return NULL;
7453 /* Fast path: if it is an ASCII-only string, construct bytes object
7454 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007455 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007456 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7457 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007458 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007459}
7460
7461PyObject *
7462PyUnicode_AsASCIIString(PyObject *unicode)
7463{
7464 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465}
7466
Steve Dowercc16be82016-09-08 10:35:16 -07007467#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007468
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007469/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007470
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007471#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007472#define NEED_RETRY
7473#endif
7474
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7479#define DECODING_CHUNK_SIZE (INT_MAX/4)
7480
Victor Stinner3a50e702011-10-18 21:21:00 +02007481#ifndef WC_ERR_INVALID_CHARS
7482# define WC_ERR_INVALID_CHARS 0x0080
7483#endif
7484
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007485static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007486code_page_name(UINT code_page, PyObject **obj)
7487{
7488 *obj = NULL;
7489 if (code_page == CP_ACP)
7490 return "mbcs";
7491 if (code_page == CP_UTF7)
7492 return "CP_UTF7";
7493 if (code_page == CP_UTF8)
7494 return "CP_UTF8";
7495
7496 *obj = PyBytes_FromFormat("cp%u", code_page);
7497 if (*obj == NULL)
7498 return NULL;
7499 return PyBytes_AS_STRING(*obj);
7500}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007501
Victor Stinner3a50e702011-10-18 21:21:00 +02007502static DWORD
7503decode_code_page_flags(UINT code_page)
7504{
7505 if (code_page == CP_UTF7) {
7506 /* The CP_UTF7 decoder only supports flags=0 */
7507 return 0;
7508 }
7509 else
7510 return MB_ERR_INVALID_CHARS;
7511}
7512
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007513/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 * Decode a byte string from a Windows code page into unicode object in strict
7515 * mode.
7516 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007517 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7518 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007519 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007520static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007521decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007522 wchar_t **buf,
7523 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 const char *in,
7525 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007527 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007528 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007530
7531 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007533 while ((outsize = MultiByteToWideChar(code_page, flags,
7534 in, insize, NULL, 0)) <= 0)
7535 {
7536 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7537 goto error;
7538 }
7539 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7540 flags = 0;
7541 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007542
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007543 /* Extend a wchar_t* buffer */
7544 Py_ssize_t n = *bufsize; /* Get the current length */
7545 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7546 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007547 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007548 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007549
7550 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007551 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7552 if (outsize <= 0)
7553 goto error;
7554 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007555
Victor Stinner3a50e702011-10-18 21:21:00 +02007556error:
7557 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7558 return -2;
7559 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007560 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007561}
7562
Victor Stinner3a50e702011-10-18 21:21:00 +02007563/*
7564 * Decode a byte string from a code page into unicode object with an error
7565 * handler.
7566 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007567 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007568 * UnicodeDecodeError exception and returns -1 on error.
7569 */
7570static int
7571decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007572 wchar_t **buf,
7573 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007574 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007575 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007576{
7577 const char *startin = in;
7578 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007579 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 /* Ideally, we should get reason from FormatMessage. This is the Windows
7581 2000 English version of the message. */
7582 const char *reason = "No mapping for the Unicode character exists "
7583 "in the target code page.";
7584 /* each step cannot decode more than 1 character, but a character can be
7585 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007586 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007587 int insize;
7588 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 PyObject *errorHandler = NULL;
7590 PyObject *exc = NULL;
7591 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007592 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 DWORD err;
7594 int ret = -1;
7595
7596 assert(size > 0);
7597
7598 encoding = code_page_name(code_page, &encoding_obj);
7599 if (encoding == NULL)
7600 return -1;
7601
Victor Stinner7d00cc12014-03-17 23:08:06 +01007602 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7604 UnicodeDecodeError. */
7605 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7606 if (exc != NULL) {
7607 PyCodec_StrictErrors(exc);
7608 Py_CLEAR(exc);
7609 }
7610 goto error;
7611 }
7612
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007613 /* Extend a wchar_t* buffer */
7614 Py_ssize_t n = *bufsize; /* Get the current length */
7615 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7616 PyErr_NoMemory();
7617 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007619 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7620 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007622 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007623
7624 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 while (in < endin)
7626 {
7627 /* Decode a character */
7628 insize = 1;
7629 do
7630 {
7631 outsize = MultiByteToWideChar(code_page, flags,
7632 in, insize,
7633 buffer, Py_ARRAY_LENGTH(buffer));
7634 if (outsize > 0)
7635 break;
7636 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007637 if (err == ERROR_INVALID_FLAGS && flags) {
7638 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7639 flags = 0;
7640 continue;
7641 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 if (err != ERROR_NO_UNICODE_TRANSLATION
7643 && err != ERROR_INSUFFICIENT_BUFFER)
7644 {
7645 PyErr_SetFromWindowsErr(0);
7646 goto error;
7647 }
7648 insize++;
7649 }
7650 /* 4=maximum length of a UTF-8 sequence */
7651 while (insize <= 4 && (in + insize) <= endin);
7652
7653 if (outsize <= 0) {
7654 Py_ssize_t startinpos, endinpos, outpos;
7655
Victor Stinner7d00cc12014-03-17 23:08:06 +01007656 /* last character in partial decode? */
7657 if (in + insize >= endin && !final)
7658 break;
7659
Victor Stinner3a50e702011-10-18 21:21:00 +02007660 startinpos = in - startin;
7661 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007662 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007663 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 errors, &errorHandler,
7665 encoding, reason,
7666 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007667 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007668 {
7669 goto error;
7670 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007671 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007672 }
7673 else {
7674 in += insize;
7675 memcpy(out, buffer, outsize * sizeof(wchar_t));
7676 out += outsize;
7677 }
7678 }
7679
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007680 /* Shrink the buffer */
7681 assert(out - *buf <= *bufsize);
7682 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007683 /* (in - startin) <= size and size is an int */
7684 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007685
7686error:
7687 Py_XDECREF(encoding_obj);
7688 Py_XDECREF(errorHandler);
7689 Py_XDECREF(exc);
7690 return ret;
7691}
7692
Victor Stinner3a50e702011-10-18 21:21:00 +02007693static PyObject *
7694decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007695 const char *s, Py_ssize_t size,
7696 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007697{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007698 wchar_t *buf = NULL;
7699 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007700 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007701
Victor Stinner3a50e702011-10-18 21:21:00 +02007702 if (code_page < 0) {
7703 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7704 return NULL;
7705 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007706 if (size < 0) {
7707 PyErr_BadInternalCall();
7708 return NULL;
7709 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007710
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007711 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007713
Victor Stinner76a31a62011-11-04 00:05:13 +01007714 do
7715 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007716#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007717 if (size > DECODING_CHUNK_SIZE) {
7718 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007719 final = 0;
7720 done = 0;
7721 }
7722 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007723#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007724 {
7725 chunk_size = (int)size;
7726 final = (consumed == NULL);
7727 done = 1;
7728 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729
Victor Stinner76a31a62011-11-04 00:05:13 +01007730 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007731 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007732 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007733 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007734 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007735
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007736 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007737 s, chunk_size);
7738 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007739 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007740 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007741 errors, final);
7742 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007743
7744 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007745 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007746 return NULL;
7747 }
7748
7749 if (consumed)
7750 *consumed += converted;
7751
7752 s += converted;
7753 size -= converted;
7754 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007755
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007756 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7757 PyMem_Free(buf);
7758 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007759}
7760
Alexander Belopolsky40018472011-02-26 01:02:56 +00007761PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007762PyUnicode_DecodeCodePageStateful(int code_page,
7763 const char *s,
7764 Py_ssize_t size,
7765 const char *errors,
7766 Py_ssize_t *consumed)
7767{
7768 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7769}
7770
7771PyObject *
7772PyUnicode_DecodeMBCSStateful(const char *s,
7773 Py_ssize_t size,
7774 const char *errors,
7775 Py_ssize_t *consumed)
7776{
7777 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7778}
7779
7780PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007781PyUnicode_DecodeMBCS(const char *s,
7782 Py_ssize_t size,
7783 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007784{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007785 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7786}
7787
Victor Stinner3a50e702011-10-18 21:21:00 +02007788static DWORD
7789encode_code_page_flags(UINT code_page, const char *errors)
7790{
7791 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007792 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007793 }
7794 else if (code_page == CP_UTF7) {
7795 /* CP_UTF7 only supports flags=0 */
7796 return 0;
7797 }
7798 else {
7799 if (errors != NULL && strcmp(errors, "replace") == 0)
7800 return 0;
7801 else
7802 return WC_NO_BEST_FIT_CHARS;
7803 }
7804}
7805
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007806/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007807 * Encode a Unicode string to a Windows code page into a byte string in strict
7808 * mode.
7809 *
7810 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007811 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007812 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007813static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007814encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007815 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007816 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007817{
Victor Stinner554f3f02010-06-16 23:33:54 +00007818 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007819 BOOL *pusedDefaultChar = &usedDefaultChar;
7820 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007821 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007822 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007823 const DWORD flags = encode_code_page_flags(code_page, NULL);
7824 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007825 /* Create a substring so that we can get the UTF-16 representation
7826 of just the slice under consideration. */
7827 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007828 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007829
Martin v. Löwis3d325192011-11-04 18:23:06 +01007830 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007831
Victor Stinner3a50e702011-10-18 21:21:00 +02007832 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007833 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007834 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007835 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007836
Victor Stinner2fc507f2011-11-04 20:06:39 +01007837 substring = PyUnicode_Substring(unicode, offset, offset+len);
7838 if (substring == NULL)
7839 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007840#if USE_UNICODE_WCHAR_CACHE
7841_Py_COMP_DIAG_PUSH
7842_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007843 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7844 if (p == NULL) {
7845 Py_DECREF(substring);
7846 return -1;
7847 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007848_Py_COMP_DIAG_POP
7849#else /* USE_UNICODE_WCHAR_CACHE */
7850 p = PyUnicode_AsWideCharString(substring, &size);
7851 Py_CLEAR(substring);
7852 if (p == NULL) {
7853 return -1;
7854 }
7855#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007856 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007857
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007858 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007859 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007860 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007861 NULL, 0,
7862 NULL, pusedDefaultChar);
7863 if (outsize <= 0)
7864 goto error;
7865 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007866 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007867 ret = -2;
7868 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007869 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007870
Victor Stinner3a50e702011-10-18 21:21:00 +02007871 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007873 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007874 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007875 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007876 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007877 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007878 }
7879 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007881 const Py_ssize_t n = PyBytes_Size(*outbytes);
7882 if (outsize > PY_SSIZE_T_MAX - n) {
7883 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007884 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007885 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007886 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007887 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007888 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007889 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007890 }
7891
7892 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007893 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007894 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007895 out, outsize,
7896 NULL, pusedDefaultChar);
7897 if (outsize <= 0)
7898 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007899 if (pusedDefaultChar && *pusedDefaultChar) {
7900 ret = -2;
7901 goto done;
7902 }
7903 ret = 0;
7904
7905done:
7906#if USE_UNICODE_WCHAR_CACHE
7907 Py_DECREF(substring);
7908#else /* USE_UNICODE_WCHAR_CACHE */
7909 PyMem_Free(p);
7910#endif /* USE_UNICODE_WCHAR_CACHE */
7911 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007912
Victor Stinner3a50e702011-10-18 21:21:00 +02007913error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007914 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7915 ret = -2;
7916 goto done;
7917 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007918 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007919 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007920}
7921
Victor Stinner3a50e702011-10-18 21:21:00 +02007922/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007923 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007924 * error handler.
7925 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007926 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007927 * -1 on other error.
7928 */
7929static int
7930encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007931 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007932 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007933{
Victor Stinner3a50e702011-10-18 21:21:00 +02007934 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007935 Py_ssize_t pos = unicode_offset;
7936 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007937 /* Ideally, we should get reason from FormatMessage. This is the Windows
7938 2000 English version of the message. */
7939 const char *reason = "invalid character";
7940 /* 4=maximum length of a UTF-8 sequence */
7941 char buffer[4];
7942 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7943 Py_ssize_t outsize;
7944 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007945 PyObject *errorHandler = NULL;
7946 PyObject *exc = NULL;
7947 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007948 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007949 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007950 PyObject *rep;
7951 int ret = -1;
7952
7953 assert(insize > 0);
7954
7955 encoding = code_page_name(code_page, &encoding_obj);
7956 if (encoding == NULL)
7957 return -1;
7958
7959 if (errors == NULL || strcmp(errors, "strict") == 0) {
7960 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7961 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007962 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007963 if (exc != NULL) {
7964 PyCodec_StrictErrors(exc);
7965 Py_DECREF(exc);
7966 }
7967 Py_XDECREF(encoding_obj);
7968 return -1;
7969 }
7970
7971 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7972 pusedDefaultChar = &usedDefaultChar;
7973 else
7974 pusedDefaultChar = NULL;
7975
7976 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7977 PyErr_NoMemory();
7978 goto error;
7979 }
7980 outsize = insize * Py_ARRAY_LENGTH(buffer);
7981
7982 if (*outbytes == NULL) {
7983 /* Create string object */
7984 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7985 if (*outbytes == NULL)
7986 goto error;
7987 out = PyBytes_AS_STRING(*outbytes);
7988 }
7989 else {
7990 /* Extend string object */
7991 Py_ssize_t n = PyBytes_Size(*outbytes);
7992 if (n > PY_SSIZE_T_MAX - outsize) {
7993 PyErr_NoMemory();
7994 goto error;
7995 }
7996 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7997 goto error;
7998 out = PyBytes_AS_STRING(*outbytes) + n;
7999 }
8000
8001 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01008002 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02008003 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01008004 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8005 wchar_t chars[2];
8006 int charsize;
8007 if (ch < 0x10000) {
8008 chars[0] = (wchar_t)ch;
8009 charsize = 1;
8010 }
8011 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01008012 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8013 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01008014 charsize = 2;
8015 }
8016
Victor Stinner3a50e702011-10-18 21:21:00 +02008017 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008018 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02008019 buffer, Py_ARRAY_LENGTH(buffer),
8020 NULL, pusedDefaultChar);
8021 if (outsize > 0) {
8022 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8023 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008024 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02008025 memcpy(out, buffer, outsize);
8026 out += outsize;
8027 continue;
8028 }
8029 }
8030 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8031 PyErr_SetFromWindowsErr(0);
8032 goto error;
8033 }
8034
Victor Stinner3a50e702011-10-18 21:21:00 +02008035 rep = unicode_encode_call_errorhandler(
8036 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01008037 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008038 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02008039 if (rep == NULL)
8040 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008041 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02008042
8043 if (PyBytes_Check(rep)) {
8044 outsize = PyBytes_GET_SIZE(rep);
8045 if (outsize != 1) {
8046 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8047 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8048 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8049 Py_DECREF(rep);
8050 goto error;
8051 }
8052 out = PyBytes_AS_STRING(*outbytes) + offset;
8053 }
8054 memcpy(out, PyBytes_AS_STRING(rep), outsize);
8055 out += outsize;
8056 }
8057 else {
8058 Py_ssize_t i;
8059 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008060 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008061
Benjamin Petersonbac79492012-01-14 13:34:47 -05008062 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008063 Py_DECREF(rep);
8064 goto error;
8065 }
8066
8067 outsize = PyUnicode_GET_LENGTH(rep);
8068 if (outsize != 1) {
8069 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8070 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8071 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8072 Py_DECREF(rep);
8073 goto error;
8074 }
8075 out = PyBytes_AS_STRING(*outbytes) + offset;
8076 }
8077 kind = PyUnicode_KIND(rep);
8078 data = PyUnicode_DATA(rep);
8079 for (i=0; i < outsize; i++) {
8080 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8081 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008082 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008083 encoding, unicode,
8084 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008085 "unable to encode error handler result to ASCII");
8086 Py_DECREF(rep);
8087 goto error;
8088 }
8089 *out = (unsigned char)ch;
8090 out++;
8091 }
8092 }
8093 Py_DECREF(rep);
8094 }
8095 /* write a NUL byte */
8096 *out = 0;
8097 outsize = out - PyBytes_AS_STRING(*outbytes);
8098 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8099 if (_PyBytes_Resize(outbytes, outsize) < 0)
8100 goto error;
8101 ret = 0;
8102
8103error:
8104 Py_XDECREF(encoding_obj);
8105 Py_XDECREF(errorHandler);
8106 Py_XDECREF(exc);
8107 return ret;
8108}
8109
Victor Stinner3a50e702011-10-18 21:21:00 +02008110static PyObject *
8111encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008112 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008113 const char *errors)
8114{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008115 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008116 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008117 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008118 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008119
Victor Stinner29dacf22015-01-26 16:41:32 +01008120 if (!PyUnicode_Check(unicode)) {
8121 PyErr_BadArgument();
8122 return NULL;
8123 }
8124
Benjamin Petersonbac79492012-01-14 13:34:47 -05008125 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008126 return NULL;
8127 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008128
Victor Stinner3a50e702011-10-18 21:21:00 +02008129 if (code_page < 0) {
8130 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8131 return NULL;
8132 }
8133
Martin v. Löwis3d325192011-11-04 18:23:06 +01008134 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008135 return PyBytes_FromStringAndSize(NULL, 0);
8136
Victor Stinner7581cef2011-11-03 22:32:33 +01008137 offset = 0;
8138 do
8139 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008140#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008141 if (len > DECODING_CHUNK_SIZE) {
8142 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008143 done = 0;
8144 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008145 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008146#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008147 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008148 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008149 done = 1;
8150 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008151
Victor Stinner76a31a62011-11-04 00:05:13 +01008152 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008153 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008154 errors);
8155 if (ret == -2)
8156 ret = encode_code_page_errors(code_page, &outbytes,
8157 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008158 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008159 if (ret < 0) {
8160 Py_XDECREF(outbytes);
8161 return NULL;
8162 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008163
Victor Stinner7581cef2011-11-03 22:32:33 +01008164 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008165 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008166 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008167
Victor Stinner3a50e702011-10-18 21:21:00 +02008168 return outbytes;
8169}
8170
8171PyObject *
8172PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8173 Py_ssize_t size,
8174 const char *errors)
8175{
Victor Stinner7581cef2011-11-03 22:32:33 +01008176 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008177 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008178 if (unicode == NULL)
8179 return NULL;
8180 res = encode_code_page(CP_ACP, unicode, errors);
8181 Py_DECREF(unicode);
8182 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008183}
8184
8185PyObject *
8186PyUnicode_EncodeCodePage(int code_page,
8187 PyObject *unicode,
8188 const char *errors)
8189{
Victor Stinner7581cef2011-11-03 22:32:33 +01008190 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008191}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008192
Alexander Belopolsky40018472011-02-26 01:02:56 +00008193PyObject *
8194PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008195{
Victor Stinner7581cef2011-11-03 22:32:33 +01008196 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008197}
8198
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008199#undef NEED_RETRY
8200
Steve Dowercc16be82016-09-08 10:35:16 -07008201#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008202
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203/* --- Character Mapping Codec -------------------------------------------- */
8204
Victor Stinnerfb161b12013-04-18 01:44:27 +02008205static int
8206charmap_decode_string(const char *s,
8207 Py_ssize_t size,
8208 PyObject *mapping,
8209 const char *errors,
8210 _PyUnicodeWriter *writer)
8211{
8212 const char *starts = s;
8213 const char *e;
8214 Py_ssize_t startinpos, endinpos;
8215 PyObject *errorHandler = NULL, *exc = NULL;
8216 Py_ssize_t maplen;
8217 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008218 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008219 Py_UCS4 x;
8220 unsigned char ch;
8221
8222 if (PyUnicode_READY(mapping) == -1)
8223 return -1;
8224
8225 maplen = PyUnicode_GET_LENGTH(mapping);
8226 mapdata = PyUnicode_DATA(mapping);
8227 mapkind = PyUnicode_KIND(mapping);
8228
8229 e = s + size;
8230
8231 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8232 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8233 * is disabled in encoding aliases, latin1 is preferred because
8234 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008235 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008236 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8237 Py_UCS4 maxchar = writer->maxchar;
8238
8239 assert (writer->kind == PyUnicode_1BYTE_KIND);
8240 while (s < e) {
8241 ch = *s;
8242 x = mapdata_ucs1[ch];
8243 if (x > maxchar) {
8244 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8245 goto onError;
8246 maxchar = writer->maxchar;
8247 outdata = (Py_UCS1 *)writer->data;
8248 }
8249 outdata[writer->pos] = x;
8250 writer->pos++;
8251 ++s;
8252 }
8253 return 0;
8254 }
8255
8256 while (s < e) {
8257 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8258 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008259 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008260 if (outkind == PyUnicode_1BYTE_KIND) {
8261 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8262 Py_UCS4 maxchar = writer->maxchar;
8263 while (s < e) {
8264 ch = *s;
8265 x = mapdata_ucs2[ch];
8266 if (x > maxchar)
8267 goto Error;
8268 outdata[writer->pos] = x;
8269 writer->pos++;
8270 ++s;
8271 }
8272 break;
8273 }
8274 else if (outkind == PyUnicode_2BYTE_KIND) {
8275 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8276 while (s < e) {
8277 ch = *s;
8278 x = mapdata_ucs2[ch];
8279 if (x == 0xFFFE)
8280 goto Error;
8281 outdata[writer->pos] = x;
8282 writer->pos++;
8283 ++s;
8284 }
8285 break;
8286 }
8287 }
8288 ch = *s;
8289
8290 if (ch < maplen)
8291 x = PyUnicode_READ(mapkind, mapdata, ch);
8292 else
8293 x = 0xfffe; /* invalid value */
8294Error:
8295 if (x == 0xfffe)
8296 {
8297 /* undefined mapping */
8298 startinpos = s-starts;
8299 endinpos = startinpos+1;
8300 if (unicode_decode_call_errorhandler_writer(
8301 errors, &errorHandler,
8302 "charmap", "character maps to <undefined>",
8303 &starts, &e, &startinpos, &endinpos, &exc, &s,
8304 writer)) {
8305 goto onError;
8306 }
8307 continue;
8308 }
8309
8310 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8311 goto onError;
8312 ++s;
8313 }
8314 Py_XDECREF(errorHandler);
8315 Py_XDECREF(exc);
8316 return 0;
8317
8318onError:
8319 Py_XDECREF(errorHandler);
8320 Py_XDECREF(exc);
8321 return -1;
8322}
8323
8324static int
8325charmap_decode_mapping(const char *s,
8326 Py_ssize_t size,
8327 PyObject *mapping,
8328 const char *errors,
8329 _PyUnicodeWriter *writer)
8330{
8331 const char *starts = s;
8332 const char *e;
8333 Py_ssize_t startinpos, endinpos;
8334 PyObject *errorHandler = NULL, *exc = NULL;
8335 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008336 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008337
8338 e = s + size;
8339
8340 while (s < e) {
8341 ch = *s;
8342
8343 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8344 key = PyLong_FromLong((long)ch);
8345 if (key == NULL)
8346 goto onError;
8347
8348 item = PyObject_GetItem(mapping, key);
8349 Py_DECREF(key);
8350 if (item == NULL) {
8351 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8352 /* No mapping found means: mapping is undefined. */
8353 PyErr_Clear();
8354 goto Undefined;
8355 } else
8356 goto onError;
8357 }
8358
8359 /* Apply mapping */
8360 if (item == Py_None)
8361 goto Undefined;
8362 if (PyLong_Check(item)) {
8363 long value = PyLong_AS_LONG(item);
8364 if (value == 0xFFFE)
8365 goto Undefined;
8366 if (value < 0 || value > MAX_UNICODE) {
8367 PyErr_Format(PyExc_TypeError,
Max Bernstein36353882020-10-17 13:38:21 -07008368 "character mapping must be in range(0x%x)",
Victor Stinnerfb161b12013-04-18 01:44:27 +02008369 (unsigned long)MAX_UNICODE + 1);
8370 goto onError;
8371 }
8372
8373 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8374 goto onError;
8375 }
8376 else if (PyUnicode_Check(item)) {
8377 if (PyUnicode_READY(item) == -1)
8378 goto onError;
8379 if (PyUnicode_GET_LENGTH(item) == 1) {
8380 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8381 if (value == 0xFFFE)
8382 goto Undefined;
8383 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8384 goto onError;
8385 }
8386 else {
8387 writer->overallocate = 1;
8388 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8389 goto onError;
8390 }
8391 }
8392 else {
8393 /* wrong return value */
8394 PyErr_SetString(PyExc_TypeError,
8395 "character mapping must return integer, None or str");
8396 goto onError;
8397 }
8398 Py_CLEAR(item);
8399 ++s;
8400 continue;
8401
8402Undefined:
8403 /* undefined mapping */
8404 Py_CLEAR(item);
8405 startinpos = s-starts;
8406 endinpos = startinpos+1;
8407 if (unicode_decode_call_errorhandler_writer(
8408 errors, &errorHandler,
8409 "charmap", "character maps to <undefined>",
8410 &starts, &e, &startinpos, &endinpos, &exc, &s,
8411 writer)) {
8412 goto onError;
8413 }
8414 }
8415 Py_XDECREF(errorHandler);
8416 Py_XDECREF(exc);
8417 return 0;
8418
8419onError:
8420 Py_XDECREF(item);
8421 Py_XDECREF(errorHandler);
8422 Py_XDECREF(exc);
8423 return -1;
8424}
8425
Alexander Belopolsky40018472011-02-26 01:02:56 +00008426PyObject *
8427PyUnicode_DecodeCharmap(const char *s,
8428 Py_ssize_t size,
8429 PyObject *mapping,
8430 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008432 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008433
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 /* Default to Latin-1 */
8435 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008439 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008440 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008441 writer.min_length = size;
8442 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008444
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008445 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008446 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8447 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008448 }
8449 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008450 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8451 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008453 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008454
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008456 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 return NULL;
8458}
8459
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008460/* Charmap encoding: the lookup table */
8461
Alexander Belopolsky40018472011-02-26 01:02:56 +00008462struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 PyObject_HEAD
8464 unsigned char level1[32];
8465 int count2, count3;
8466 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008467};
8468
8469static PyObject*
8470encoding_map_size(PyObject *obj, PyObject* args)
8471{
8472 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008475}
8476
8477static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 PyDoc_STR("Return the size (in bytes) of this object") },
8480 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008481};
8482
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008483static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008484 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 "EncodingMap", /*tp_name*/
8486 sizeof(struct encoding_map), /*tp_basicsize*/
8487 0, /*tp_itemsize*/
8488 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008489 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008490 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 0, /*tp_getattr*/
8492 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008493 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 0, /*tp_repr*/
8495 0, /*tp_as_number*/
8496 0, /*tp_as_sequence*/
8497 0, /*tp_as_mapping*/
8498 0, /*tp_hash*/
8499 0, /*tp_call*/
8500 0, /*tp_str*/
8501 0, /*tp_getattro*/
8502 0, /*tp_setattro*/
8503 0, /*tp_as_buffer*/
8504 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8505 0, /*tp_doc*/
8506 0, /*tp_traverse*/
8507 0, /*tp_clear*/
8508 0, /*tp_richcompare*/
8509 0, /*tp_weaklistoffset*/
8510 0, /*tp_iter*/
8511 0, /*tp_iternext*/
8512 encoding_map_methods, /*tp_methods*/
8513 0, /*tp_members*/
8514 0, /*tp_getset*/
8515 0, /*tp_base*/
8516 0, /*tp_dict*/
8517 0, /*tp_descr_get*/
8518 0, /*tp_descr_set*/
8519 0, /*tp_dictoffset*/
8520 0, /*tp_init*/
8521 0, /*tp_alloc*/
8522 0, /*tp_new*/
8523 0, /*tp_free*/
8524 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008525};
8526
8527PyObject*
8528PyUnicode_BuildEncodingMap(PyObject* string)
8529{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008530 PyObject *result;
8531 struct encoding_map *mresult;
8532 int i;
8533 int need_dict = 0;
8534 unsigned char level1[32];
8535 unsigned char level2[512];
8536 unsigned char *mlevel1, *mlevel2, *mlevel3;
8537 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008539 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008540 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008542
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008543 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008544 PyErr_BadArgument();
8545 return NULL;
8546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 kind = PyUnicode_KIND(string);
8548 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008549 length = PyUnicode_GET_LENGTH(string);
8550 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008551 memset(level1, 0xFF, sizeof level1);
8552 memset(level2, 0xFF, sizeof level2);
8553
8554 /* If there isn't a one-to-one mapping of NULL to \0,
8555 or if there are non-BMP characters, we need to use
8556 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008558 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008559 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008560 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 ch = PyUnicode_READ(kind, data, i);
8562 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008563 need_dict = 1;
8564 break;
8565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008567 /* unmapped character */
8568 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 l1 = ch >> 11;
8570 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008571 if (level1[l1] == 0xFF)
8572 level1[l1] = count2++;
8573 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008574 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008575 }
8576
8577 if (count2 >= 0xFF || count3 >= 0xFF)
8578 need_dict = 1;
8579
8580 if (need_dict) {
8581 PyObject *result = PyDict_New();
8582 PyObject *key, *value;
8583 if (!result)
8584 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008585 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008587 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008588 if (!key || !value)
8589 goto failed1;
8590 if (PyDict_SetItem(result, key, value) == -1)
8591 goto failed1;
8592 Py_DECREF(key);
8593 Py_DECREF(value);
8594 }
8595 return result;
8596 failed1:
8597 Py_XDECREF(key);
8598 Py_XDECREF(value);
8599 Py_DECREF(result);
8600 return NULL;
8601 }
8602
8603 /* Create a three-level trie */
Victor Stinner32bd68c2020-12-01 10:37:39 +01008604 result = PyObject_Malloc(sizeof(struct encoding_map) +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008605 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008606 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008607 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008608 }
8609
8610 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008611 mresult = (struct encoding_map*)result;
8612 mresult->count2 = count2;
8613 mresult->count3 = count3;
8614 mlevel1 = mresult->level1;
8615 mlevel2 = mresult->level23;
8616 mlevel3 = mresult->level23 + 16*count2;
8617 memcpy(mlevel1, level1, 32);
8618 memset(mlevel2, 0xFF, 16*count2);
8619 memset(mlevel3, 0, 128*count3);
8620 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008621 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008622 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008623 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8624 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008625 /* unmapped character */
8626 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008627 o1 = ch>>11;
8628 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008629 i2 = 16*mlevel1[o1] + o2;
8630 if (mlevel2[i2] == 0xFF)
8631 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008632 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008633 i3 = 128*mlevel2[i2] + o3;
8634 mlevel3[i3] = i;
8635 }
8636 return result;
8637}
8638
8639static int
Victor Stinner22168992011-11-20 17:09:18 +01008640encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008641{
8642 struct encoding_map *map = (struct encoding_map*)mapping;
8643 int l1 = c>>11;
8644 int l2 = (c>>7) & 0xF;
8645 int l3 = c & 0x7F;
8646 int i;
8647
Victor Stinner22168992011-11-20 17:09:18 +01008648 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008650 if (c == 0)
8651 return 0;
8652 /* level 1*/
8653 i = map->level1[l1];
8654 if (i == 0xFF) {
8655 return -1;
8656 }
8657 /* level 2*/
8658 i = map->level23[16*i+l2];
8659 if (i == 0xFF) {
8660 return -1;
8661 }
8662 /* level 3 */
8663 i = map->level23[16*map->count2 + 128*i + l3];
8664 if (i == 0) {
8665 return -1;
8666 }
8667 return i;
8668}
8669
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670/* Lookup the character ch in the mapping. If the character
8671 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008672 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008673static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008674charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675{
Christian Heimes217cfd12007-12-02 14:31:20 +00008676 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 PyObject *x;
8678
8679 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 x = PyObject_GetItem(mapping, w);
8682 Py_DECREF(w);
8683 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8685 /* No mapping found means: mapping is undefined. */
8686 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008687 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 } else
8689 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008691 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008693 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 long value = PyLong_AS_LONG(x);
8695 if (value < 0 || value > 255) {
8696 PyErr_SetString(PyExc_TypeError,
8697 "character mapping must be in range(256)");
8698 Py_DECREF(x);
8699 return NULL;
8700 }
8701 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008703 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 /* wrong return value */
8707 PyErr_Format(PyExc_TypeError,
8708 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008709 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 Py_DECREF(x);
8711 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 }
8713}
8714
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008715static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008716charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008717{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008718 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8719 /* exponentially overallocate to minimize reallocations */
8720 if (requiredsize < 2*outsize)
8721 requiredsize = 2*outsize;
8722 if (_PyBytes_Resize(outobj, requiredsize))
8723 return -1;
8724 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008725}
8726
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008731 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 space is available. Return a new reference to the object that
8733 was put in the output buffer, or Py_None, if the mapping was undefined
8734 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008735 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008736static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008737charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008738 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008739{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008740 PyObject *rep;
8741 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008742 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008743
Andy Lesterdffe4c02020-03-04 07:15:20 -06008744 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008745 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008747 if (res == -1)
8748 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 if (outsize<requiredsize)
8750 if (charmapencode_resize(outobj, outpos, requiredsize))
8751 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008752 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 outstart[(*outpos)++] = (char)res;
8754 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008755 }
8756
8757 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008760 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 Py_DECREF(rep);
8762 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008763 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 if (PyLong_Check(rep)) {
8765 Py_ssize_t requiredsize = *outpos+1;
8766 if (outsize<requiredsize)
8767 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8768 Py_DECREF(rep);
8769 return enc_EXCEPTION;
8770 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008771 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008773 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 else {
8775 const char *repchars = PyBytes_AS_STRING(rep);
8776 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8777 Py_ssize_t requiredsize = *outpos+repsize;
8778 if (outsize<requiredsize)
8779 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8780 Py_DECREF(rep);
8781 return enc_EXCEPTION;
8782 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008783 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 memcpy(outstart + *outpos, repchars, repsize);
8785 *outpos += repsize;
8786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008787 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008788 Py_DECREF(rep);
8789 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008790}
8791
8792/* handle an error in PyUnicode_EncodeCharmap
8793 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008794static int
8795charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008796 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008797 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008798 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008799 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008800{
8801 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008802 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008803 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008804 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008805 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008806 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008808 Py_ssize_t collstartpos = *inpos;
8809 Py_ssize_t collendpos = *inpos+1;
8810 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008811 const char *encoding = "charmap";
8812 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008813 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008814 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008815 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816
Benjamin Petersonbac79492012-01-14 13:34:47 -05008817 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008818 return -1;
8819 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820 /* find all unencodable characters */
8821 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008822 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008823 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008824 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008825 val = encoding_map_lookup(ch, mapping);
8826 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 break;
8828 ++collendpos;
8829 continue;
8830 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008831
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008832 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8833 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 if (rep==NULL)
8835 return -1;
8836 else if (rep!=Py_None) {
8837 Py_DECREF(rep);
8838 break;
8839 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008840 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008842 }
8843 /* cache callback name lookup
8844 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008845 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008846 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008847
8848 switch (*error_handler) {
8849 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008850 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008851 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008852
8853 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008854 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 x = charmapencode_output('?', mapping, res, respos);
8856 if (x==enc_EXCEPTION) {
8857 return -1;
8858 }
8859 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008860 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 return -1;
8862 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008863 }
8864 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008865 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008866 *inpos = collendpos;
8867 break;
Victor Stinner50149202015-09-22 00:26:54 +02008868
8869 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008870 /* generate replacement (temporarily (mis)uses p) */
8871 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 char buffer[2+29+1+1];
8873 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008874 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 for (cp = buffer; *cp; ++cp) {
8876 x = charmapencode_output(*cp, mapping, res, respos);
8877 if (x==enc_EXCEPTION)
8878 return -1;
8879 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008880 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 return -1;
8882 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008883 }
8884 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008885 *inpos = collendpos;
8886 break;
Victor Stinner50149202015-09-22 00:26:54 +02008887
Benjamin Peterson14339b62009-01-31 16:36:08 +00008888 default:
Victor Stinner50149202015-09-22 00:26:54 +02008889 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008890 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008892 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008894 if (PyBytes_Check(repunicode)) {
8895 /* Directly copy bytes result to output. */
8896 Py_ssize_t outsize = PyBytes_Size(*res);
8897 Py_ssize_t requiredsize;
8898 repsize = PyBytes_Size(repunicode);
8899 requiredsize = *respos + repsize;
8900 if (requiredsize > outsize)
8901 /* Make room for all additional bytes. */
8902 if (charmapencode_resize(res, respos, requiredsize)) {
8903 Py_DECREF(repunicode);
8904 return -1;
8905 }
8906 memcpy(PyBytes_AsString(*res) + *respos,
8907 PyBytes_AsString(repunicode), repsize);
8908 *respos += repsize;
8909 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008910 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008911 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008912 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008913 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008914 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008915 Py_DECREF(repunicode);
8916 return -1;
8917 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008918 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008919 data = PyUnicode_DATA(repunicode);
8920 kind = PyUnicode_KIND(repunicode);
8921 for (index = 0; index < repsize; index++) {
8922 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8923 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008925 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 return -1;
8927 }
8928 else if (x==enc_FAILED) {
8929 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008930 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 return -1;
8932 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008933 }
8934 *inpos = newpos;
8935 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008936 }
8937 return 0;
8938}
8939
Alexander Belopolsky40018472011-02-26 01:02:56 +00008940PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008941_PyUnicode_EncodeCharmap(PyObject *unicode,
8942 PyObject *mapping,
8943 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945 /* output object */
8946 PyObject *res = NULL;
8947 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008948 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008949 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008950 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008951 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008952 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008953 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008954 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008955 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008956 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957
Benjamin Petersonbac79492012-01-14 13:34:47 -05008958 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008959 return NULL;
8960 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008961 data = PyUnicode_DATA(unicode);
8962 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008963
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964 /* Default to Latin-1 */
8965 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008966 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008968 /* allocate enough for a simple encoding without
8969 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008970 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008971 if (res == NULL)
8972 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008973 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008976 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008977 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008979 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 if (x==enc_EXCEPTION) /* error */
8981 goto onError;
8982 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008983 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008985 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 &res, &respos)) {
8987 goto onError;
8988 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008989 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 else
8991 /* done with this character => adjust input position */
8992 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008995 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008996 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008997 if (_PyBytes_Resize(&res, respos) < 0)
8998 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008999
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009000 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02009001 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009002 return res;
9003
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009005 Py_XDECREF(res);
9006 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02009007 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 return NULL;
9009}
9010
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009011/* Deprecated */
9012PyObject *
9013PyUnicode_EncodeCharmap(const Py_UNICODE *p,
9014 Py_ssize_t size,
9015 PyObject *mapping,
9016 const char *errors)
9017{
9018 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009019 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009020 if (unicode == NULL)
9021 return NULL;
9022 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
9023 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01009024 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009025}
9026
Alexander Belopolsky40018472011-02-26 01:02:56 +00009027PyObject *
9028PyUnicode_AsCharmapString(PyObject *unicode,
9029 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030{
9031 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 PyErr_BadArgument();
9033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009035 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036}
9037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009038/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009039static void
9040make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009042 Py_ssize_t startpos, Py_ssize_t endpos,
9043 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009045 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 *exceptionObject = _PyUnicodeTranslateError_Create(
9047 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 }
9049 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9051 goto onError;
9052 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9053 goto onError;
9054 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9055 goto onError;
9056 return;
9057 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02009058 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059 }
9060}
9061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009062/* error handling callback helper:
9063 build arguments, call the callback and check the arguments,
9064 put the result into newpos and return the replacement string, which
9065 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009066static PyObject *
9067unicode_translate_call_errorhandler(const char *errors,
9068 PyObject **errorHandler,
9069 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009071 Py_ssize_t startpos, Py_ssize_t endpos,
9072 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009073{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009074 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009075
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009076 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009077 PyObject *restuple;
9078 PyObject *resunicode;
9079
9080 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009082 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009083 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009084 }
9085
9086 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009088 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009090
Petr Viktorinffd97532020-02-11 17:46:57 +01009091 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009092 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009094 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009095 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 Py_DECREF(restuple);
9097 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009098 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009099 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 &resunicode, &i_newpos)) {
9101 Py_DECREF(restuple);
9102 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009103 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009104 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009106 else
9107 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009109 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 Py_DECREF(restuple);
9111 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009112 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009113 Py_INCREF(resunicode);
9114 Py_DECREF(restuple);
9115 return resunicode;
9116}
9117
9118/* Lookup the character ch in the mapping and put the result in result,
9119 which must be decrefed by the caller.
9120 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009121static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009123{
Christian Heimes217cfd12007-12-02 14:31:20 +00009124 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009125 PyObject *x;
9126
9127 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009129 x = PyObject_GetItem(mapping, w);
9130 Py_DECREF(w);
9131 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9133 /* No mapping found means: use 1:1 mapping. */
9134 PyErr_Clear();
9135 *result = NULL;
9136 return 0;
9137 } else
9138 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009139 }
9140 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 *result = x;
9142 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009143 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009144 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009146 if (value < 0 || value > MAX_UNICODE) {
9147 PyErr_Format(PyExc_ValueError,
9148 "character mapping must be in range(0x%x)",
9149 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 Py_DECREF(x);
9151 return -1;
9152 }
9153 *result = x;
9154 return 0;
9155 }
9156 else if (PyUnicode_Check(x)) {
9157 *result = x;
9158 return 0;
9159 }
9160 else {
9161 /* wrong return value */
9162 PyErr_SetString(PyExc_TypeError,
9163 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009164 Py_DECREF(x);
9165 return -1;
9166 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009167}
Victor Stinner1194ea02014-04-04 19:37:40 +02009168
9169/* lookup the character, write the result into the writer.
9170 Return 1 if the result was written into the writer, return 0 if the mapping
9171 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009172static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009173charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9174 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009175{
Victor Stinner1194ea02014-04-04 19:37:40 +02009176 PyObject *item;
9177
9178 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009179 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009180
9181 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009183 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009186 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009187 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009188
9189 if (item == Py_None) {
9190 Py_DECREF(item);
9191 return 0;
9192 }
9193
9194 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009195 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9196 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9197 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009198 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9199 Py_DECREF(item);
9200 return -1;
9201 }
9202 Py_DECREF(item);
9203 return 1;
9204 }
9205
9206 if (!PyUnicode_Check(item)) {
9207 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009208 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009209 }
9210
9211 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9212 Py_DECREF(item);
9213 return -1;
9214 }
9215
9216 Py_DECREF(item);
9217 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009218}
9219
Victor Stinner89a76ab2014-04-05 11:44:04 +02009220static int
9221unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9222 Py_UCS1 *translate)
9223{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009224 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009225 int ret = 0;
9226
Victor Stinner89a76ab2014-04-05 11:44:04 +02009227 if (charmaptranslate_lookup(ch, mapping, &item)) {
9228 return -1;
9229 }
9230
9231 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009232 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009233 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009234 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009235 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009236 /* not found => default to 1:1 mapping */
9237 translate[ch] = ch;
9238 return 1;
9239 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009240 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009241 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009242 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9243 used it */
9244 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009245 /* invalid character or character outside ASCII:
9246 skip the fast translate */
9247 goto exit;
9248 }
9249 translate[ch] = (Py_UCS1)replace;
9250 }
9251 else if (PyUnicode_Check(item)) {
9252 Py_UCS4 replace;
9253
9254 if (PyUnicode_READY(item) == -1) {
9255 Py_DECREF(item);
9256 return -1;
9257 }
9258 if (PyUnicode_GET_LENGTH(item) != 1)
9259 goto exit;
9260
9261 replace = PyUnicode_READ_CHAR(item, 0);
9262 if (replace > 127)
9263 goto exit;
9264 translate[ch] = (Py_UCS1)replace;
9265 }
9266 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009267 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009268 goto exit;
9269 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009270 ret = 1;
9271
Benjamin Peterson1365de72014-04-07 20:15:41 -04009272 exit:
9273 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009274 return ret;
9275}
9276
9277/* Fast path for ascii => ascii translation. Return 1 if the whole string
9278 was translated into writer, return 0 if the input string was partially
9279 translated into writer, raise an exception and return -1 on error. */
9280static int
9281unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009282 _PyUnicodeWriter *writer, int ignore,
9283 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009284{
Victor Stinner872b2912014-04-05 14:27:07 +02009285 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009286 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009287 const Py_UCS1 *in, *end;
9288 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009289 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009290
Victor Stinner89a76ab2014-04-05 11:44:04 +02009291 len = PyUnicode_GET_LENGTH(input);
9292
Victor Stinner872b2912014-04-05 14:27:07 +02009293 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009294
9295 in = PyUnicode_1BYTE_DATA(input);
9296 end = in + len;
9297
9298 assert(PyUnicode_IS_ASCII(writer->buffer));
9299 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9300 out = PyUnicode_1BYTE_DATA(writer->buffer);
9301
Victor Stinner872b2912014-04-05 14:27:07 +02009302 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009303 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009304 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009305 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009306 int translate = unicode_fast_translate_lookup(mapping, ch,
9307 ascii_table);
9308 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009309 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009310 if (translate == 0)
9311 goto exit;
9312 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009313 }
Victor Stinner872b2912014-04-05 14:27:07 +02009314 if (ch2 == 0xfe) {
9315 if (ignore)
9316 continue;
9317 goto exit;
9318 }
9319 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009320 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009321 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009322 }
Victor Stinner872b2912014-04-05 14:27:07 +02009323 res = 1;
9324
9325exit:
9326 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009327 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009328 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009329}
9330
Victor Stinner3222da22015-10-01 22:07:32 +02009331static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332_PyUnicode_TranslateCharmap(PyObject *input,
9333 PyObject *mapping,
9334 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009337 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 Py_ssize_t size, i;
9339 int kind;
9340 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009341 _PyUnicodeWriter writer;
9342 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009343 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009344 PyObject *errorHandler = NULL;
9345 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009346 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009347 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009348
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009350 PyErr_BadArgument();
9351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 if (PyUnicode_READY(input) == -1)
9355 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009356 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 kind = PyUnicode_KIND(input);
9358 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009360 if (size == 0)
9361 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009363 /* allocate enough for a simple 1:1 translation without
9364 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009365 _PyUnicodeWriter_Init(&writer);
9366 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368
Victor Stinner872b2912014-04-05 14:27:07 +02009369 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9370
Victor Stinner33798672016-03-01 21:59:58 +01009371 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009372 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009373 if (PyUnicode_IS_ASCII(input)) {
9374 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9375 if (res < 0) {
9376 _PyUnicodeWriter_Dealloc(&writer);
9377 return NULL;
9378 }
9379 if (res == 1)
9380 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009381 }
Victor Stinner33798672016-03-01 21:59:58 +01009382 else {
9383 i = 0;
9384 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009388 int translate;
9389 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9390 Py_ssize_t newpos;
9391 /* startpos for collecting untranslatable chars */
9392 Py_ssize_t collstart;
9393 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009394 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395
Victor Stinner1194ea02014-04-04 19:37:40 +02009396 ch = PyUnicode_READ(kind, data, i);
9397 translate = charmaptranslate_output(ch, mapping, &writer);
9398 if (translate < 0)
9399 goto onError;
9400
9401 if (translate != 0) {
9402 /* it worked => adjust input pointer */
9403 ++i;
9404 continue;
9405 }
9406
9407 /* untranslatable character */
9408 collstart = i;
9409 collend = i+1;
9410
9411 /* find all untranslatable characters */
9412 while (collend < size) {
9413 PyObject *x;
9414 ch = PyUnicode_READ(kind, data, collend);
9415 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009416 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009417 Py_XDECREF(x);
9418 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009419 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009420 ++collend;
9421 }
9422
9423 if (ignore) {
9424 i = collend;
9425 }
9426 else {
9427 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9428 reason, input, &exc,
9429 collstart, collend, &newpos);
9430 if (repunicode == NULL)
9431 goto onError;
9432 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009433 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009434 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009435 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009436 Py_DECREF(repunicode);
9437 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009438 }
9439 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009440 Py_XDECREF(exc);
9441 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009442 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443
Benjamin Peterson29060642009-01-31 22:14:21 +00009444 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009445 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009446 Py_XDECREF(exc);
9447 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448 return NULL;
9449}
9450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451/* Deprecated. Use PyUnicode_Translate instead. */
9452PyObject *
9453PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9454 Py_ssize_t size,
9455 PyObject *mapping,
9456 const char *errors)
9457{
Christian Heimes5f520f42012-09-11 14:03:25 +02009458 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009459 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 if (!unicode)
9461 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009462 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9463 Py_DECREF(unicode);
9464 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465}
9466
Alexander Belopolsky40018472011-02-26 01:02:56 +00009467PyObject *
9468PyUnicode_Translate(PyObject *str,
9469 PyObject *mapping,
9470 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009472 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009473 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009474 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
Tim Petersced69f82003-09-16 20:30:58 +00009476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477PyObject *
9478_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9479{
9480 if (!PyUnicode_Check(unicode)) {
9481 PyErr_BadInternalCall();
9482 return NULL;
9483 }
9484 if (PyUnicode_READY(unicode) == -1)
9485 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009486 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 /* If the string is already ASCII, just return the same string */
9488 Py_INCREF(unicode);
9489 return unicode;
9490 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009491
9492 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9493 PyObject *result = PyUnicode_New(len, 127);
9494 if (result == NULL) {
9495 return NULL;
9496 }
9497
9498 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9499 int kind = PyUnicode_KIND(unicode);
9500 const void *data = PyUnicode_DATA(unicode);
9501 Py_ssize_t i;
9502 for (i = 0; i < len; ++i) {
9503 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9504 if (ch < 127) {
9505 out[i] = ch;
9506 }
9507 else if (Py_UNICODE_ISSPACE(ch)) {
9508 out[i] = ' ';
9509 }
9510 else {
9511 int decimal = Py_UNICODE_TODECIMAL(ch);
9512 if (decimal < 0) {
9513 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009514 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009515 _PyUnicode_LENGTH(result) = i + 1;
9516 break;
9517 }
9518 out[i] = '0' + decimal;
9519 }
9520 }
9521
INADA Naoki16dfca42018-07-14 12:06:43 +09009522 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009523 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524}
9525
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009526PyObject *
9527PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9528 Py_ssize_t length)
9529{
Victor Stinnerf0124502011-11-21 23:12:56 +01009530 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009531 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009532 Py_UCS4 maxchar;
9533 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009534 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009535
Victor Stinner99d7ad02012-02-22 13:37:39 +01009536 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009537 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009538 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009539 if (ch > 127) {
9540 int decimal = Py_UNICODE_TODECIMAL(ch);
9541 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009542 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009543 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009544 }
9545 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009546
9547 /* Copy to a new string */
9548 decimal = PyUnicode_New(length, maxchar);
9549 if (decimal == NULL)
9550 return decimal;
9551 kind = PyUnicode_KIND(decimal);
9552 data = PyUnicode_DATA(decimal);
9553 /* Iterate over code points */
9554 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009555 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009556 if (ch > 127) {
9557 int decimal = Py_UNICODE_TODECIMAL(ch);
9558 if (decimal >= 0)
9559 ch = '0' + decimal;
9560 }
9561 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009563 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009564}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009565/* --- Decimal Encoder ---------------------------------------------------- */
9566
Alexander Belopolsky40018472011-02-26 01:02:56 +00009567int
9568PyUnicode_EncodeDecimal(Py_UNICODE *s,
9569 Py_ssize_t length,
9570 char *output,
9571 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009572{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009573 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009574 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009575 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009576 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009577
9578 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 PyErr_BadArgument();
9580 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009581 }
9582
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009583 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009584 if (unicode == NULL)
9585 return -1;
9586
Victor Stinner42bf7752011-11-21 22:52:58 +01009587 kind = PyUnicode_KIND(unicode);
9588 data = PyUnicode_DATA(unicode);
9589
Victor Stinnerb84d7232011-11-22 01:50:07 +01009590 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009591 PyObject *exc;
9592 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009593 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009594 Py_ssize_t startpos;
9595
9596 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009597
Benjamin Peterson29060642009-01-31 22:14:21 +00009598 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009599 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009600 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009601 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009602 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009603 decimal = Py_UNICODE_TODECIMAL(ch);
9604 if (decimal >= 0) {
9605 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009606 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009607 continue;
9608 }
9609 if (0 < ch && ch < 256) {
9610 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009611 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 continue;
9613 }
Victor Stinner6345be92011-11-25 20:09:01 +01009614
Victor Stinner42bf7752011-11-21 22:52:58 +01009615 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009616 exc = NULL;
9617 raise_encode_exception(&exc, "decimal", unicode,
9618 startpos, startpos+1,
9619 "invalid decimal Unicode string");
9620 Py_XDECREF(exc);
9621 Py_DECREF(unicode);
9622 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009623 }
9624 /* 0-terminate the output string */
9625 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009626 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009627 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009628}
9629
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630/* --- Helpers ------------------------------------------------------------ */
9631
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and then clamped to 0.  `start` is not clamped from above.
   `start` and `end` must be modifiable lvalues; every argument is evaluated
   more than once.  The body is wrapped in do { } while (0) so the macro
   behaves as a single statement (e.g. inside an unbraced if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009648any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009650 Py_ssize_t end,
9651 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009653 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009654 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 Py_ssize_t len1, len2, result;
9656
9657 kind1 = PyUnicode_KIND(s1);
9658 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009659 if (kind1 < kind2)
9660 return -1;
9661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 len1 = PyUnicode_GET_LENGTH(s1);
9663 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009664 ADJUST_INDICES(start, end, len1);
9665 if (end - start < len2)
9666 return -1;
9667
9668 buf1 = PyUnicode_DATA(s1);
9669 buf2 = PyUnicode_DATA(s2);
9670 if (len2 == 1) {
9671 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9672 result = findchar((const char *)buf1 + kind1*start,
9673 kind1, end - start, ch, direction);
9674 if (result == -1)
9675 return -1;
9676 else
9677 return start + result;
9678 }
9679
9680 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009681 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009682 if (!buf2)
9683 return -2;
9684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685
Victor Stinner794d5672011-10-10 03:21:36 +02009686 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009687 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009688 case PyUnicode_1BYTE_KIND:
9689 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9690 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9691 else
9692 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9693 break;
9694 case PyUnicode_2BYTE_KIND:
9695 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9696 break;
9697 case PyUnicode_4BYTE_KIND:
9698 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9699 break;
9700 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009701 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009702 }
9703 }
9704 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009705 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009706 case PyUnicode_1BYTE_KIND:
9707 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9708 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9709 else
9710 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9711 break;
9712 case PyUnicode_2BYTE_KIND:
9713 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9714 break;
9715 case PyUnicode_4BYTE_KIND:
9716 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9717 break;
9718 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009719 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009720 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 }
9722
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009723 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009724 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009725 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726
9727 return result;
9728}
9729
Victor Stinner59423e32018-11-26 13:40:01 +01009730/* _PyUnicode_InsertThousandsGrouping() helper functions */
9731#include "stringlib/localeutil.h"
9732
9733/**
9734 * InsertThousandsGrouping:
9735 * @writer: Unicode writer.
9736 * @n_buffer: Number of characters in @buffer.
9737 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9738 * @d_pos: Start of digits string.
9739 * @n_digits: The number of digits in the string, in which we want
9740 * to put the grouping chars.
9741 * @min_width: The minimum width of the digits in the output string.
9742 * Output will be zero-padded on the left to fill.
9743 * @grouping: see definition in localeconv().
9744 * @thousands_sep: see definition in localeconv().
9745 *
9746 * There are 2 modes: counting and filling. If @writer is NULL,
9747 * we are in counting mode, else filling mode.
9748 * If counting, the required buffer size is returned.
9749 * If filling, we know the buffer will be large enough, so we don't
9750 * need to pass in the buffer size.
9751 * Inserts thousand grouping characters (as defined by grouping and
9752 * thousands_sep) into @writer.
9753 *
9754 * Return value: -1 on error, number of characters otherwise.
9755 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009757_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009758 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009759 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009760 PyObject *digits,
9761 Py_ssize_t d_pos,
9762 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009763 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009764 const char *grouping,
9765 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009766 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767{
Xtreak3f7983a2019-01-07 20:39:14 +05309768 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009769 if (writer) {
9770 assert(digits != NULL);
9771 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009772 }
9773 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009774 assert(digits == NULL);
9775 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009776 }
Victor Stinner59423e32018-11-26 13:40:01 +01009777 assert(0 <= d_pos);
9778 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009779 assert(grouping != NULL);
9780
9781 if (digits != NULL) {
9782 if (PyUnicode_READY(digits) == -1) {
9783 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009784 }
Victor Stinner59423e32018-11-26 13:40:01 +01009785 }
9786 if (PyUnicode_READY(thousands_sep) == -1) {
9787 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009788 }
9789
Victor Stinner59423e32018-11-26 13:40:01 +01009790 Py_ssize_t count = 0;
9791 Py_ssize_t n_zeros;
9792 int loop_broken = 0;
9793 int use_separator = 0; /* First time through, don't append the
9794 separator. They only go between
9795 groups. */
9796 Py_ssize_t buffer_pos;
9797 Py_ssize_t digits_pos;
9798 Py_ssize_t len;
9799 Py_ssize_t n_chars;
9800 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9801 be looked at */
9802 /* A generator that returns all of the grouping widths, until it
9803 returns 0. */
9804 GroupGenerator groupgen;
9805 GroupGenerator_init(&groupgen, grouping);
9806 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9807
9808 /* if digits are not grouped, thousands separator
9809 should be an empty string */
9810 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9811
9812 digits_pos = d_pos + n_digits;
9813 if (writer) {
9814 buffer_pos = writer->pos + n_buffer;
9815 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9816 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 }
Victor Stinner59423e32018-11-26 13:40:01 +01009818 else {
9819 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009820 }
Victor Stinner59423e32018-11-26 13:40:01 +01009821
9822 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009823 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009824 }
Victor Stinner59423e32018-11-26 13:40:01 +01009825
9826 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9827 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9828 n_zeros = Py_MAX(0, len - remaining);
9829 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9830
9831 /* Use n_zero zero's and n_chars chars */
9832
9833 /* Count only, don't do anything. */
9834 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9835
9836 /* Copy into the writer. */
9837 InsertThousandsGrouping_fill(writer, &buffer_pos,
9838 digits, &digits_pos,
9839 n_chars, n_zeros,
9840 use_separator ? thousands_sep : NULL,
9841 thousands_sep_len, maxchar);
9842
9843 /* Use a separator next time. */
9844 use_separator = 1;
9845
9846 remaining -= n_chars;
9847 min_width -= len;
9848
9849 if (remaining <= 0 && min_width <= 0) {
9850 loop_broken = 1;
9851 break;
9852 }
9853 min_width -= thousands_sep_len;
9854 }
9855 if (!loop_broken) {
9856 /* We left the loop without using a break statement. */
9857
9858 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9859 n_zeros = Py_MAX(0, len - remaining);
9860 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9861
9862 /* Use n_zero zero's and n_chars chars */
9863 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9864
9865 /* Copy into the writer. */
9866 InsertThousandsGrouping_fill(writer, &buffer_pos,
9867 digits, &digits_pos,
9868 n_chars, n_zeros,
9869 use_separator ? thousands_sep : NULL,
9870 thousands_sep_len, maxchar);
9871 }
9872 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873}
9874
9875
Alexander Belopolsky40018472011-02-26 01:02:56 +00009876Py_ssize_t
9877PyUnicode_Count(PyObject *str,
9878 PyObject *substr,
9879 Py_ssize_t start,
9880 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009882 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009883 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009884 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009886
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009887 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009888 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009889
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009890 kind1 = PyUnicode_KIND(str);
9891 kind2 = PyUnicode_KIND(substr);
9892 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009893 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009894
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009895 len1 = PyUnicode_GET_LENGTH(str);
9896 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009898 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009899 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009900
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009901 buf1 = PyUnicode_DATA(str);
9902 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009903 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009904 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009905 if (!buf2)
9906 goto onError;
9907 }
9908
9909 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009911 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009912 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009913 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009914 buf2, len2, PY_SSIZE_T_MAX
9915 );
9916 else
9917 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009918 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009919 buf2, len2, PY_SSIZE_T_MAX
9920 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 break;
9922 case PyUnicode_2BYTE_KIND:
9923 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009924 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 buf2, len2, PY_SSIZE_T_MAX
9926 );
9927 break;
9928 case PyUnicode_4BYTE_KIND:
9929 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009930 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 buf2, len2, PY_SSIZE_T_MAX
9932 );
9933 break;
9934 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009935 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009937
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009938 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009939 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009940 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009944 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9945 if (kind2 != kind1)
9946 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948}
9949
Alexander Belopolsky40018472011-02-26 01:02:56 +00009950Py_ssize_t
9951PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009952 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009953 Py_ssize_t start,
9954 Py_ssize_t end,
9955 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009956{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009957 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009958 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009959
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009960 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961}
9962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963Py_ssize_t
9964PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9965 Py_ssize_t start, Py_ssize_t end,
9966 int direction)
9967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009969 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 if (PyUnicode_READY(str) == -1)
9971 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009972 len = PyUnicode_GET_LENGTH(str);
9973 ADJUST_INDICES(start, end, len);
9974 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009975 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009977 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9978 kind, end-start, ch, direction);
9979 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009981 else
9982 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983}
9984
Alexander Belopolsky40018472011-02-26 01:02:56 +00009985static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009986tailmatch(PyObject *self,
9987 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009988 Py_ssize_t start,
9989 Py_ssize_t end,
9990 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 int kind_self;
9993 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009994 const void *data_self;
9995 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 Py_ssize_t offset;
9997 Py_ssize_t i;
9998 Py_ssize_t end_sub;
9999
10000 if (PyUnicode_READY(self) == -1 ||
10001 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +010010002 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
10005 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +000010007 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +030010009 if (PyUnicode_GET_LENGTH(substring) == 0)
10010 return 1;
10011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 kind_self = PyUnicode_KIND(self);
10013 data_self = PyUnicode_DATA(self);
10014 kind_sub = PyUnicode_KIND(substring);
10015 data_sub = PyUnicode_DATA(substring);
10016 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
10017
10018 if (direction > 0)
10019 offset = end;
10020 else
10021 offset = start;
10022
10023 if (PyUnicode_READ(kind_self, data_self, offset) ==
10024 PyUnicode_READ(kind_sub, data_sub, 0) &&
10025 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
10026 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
10027 /* If both are of the same kind, memcmp is sufficient */
10028 if (kind_self == kind_sub) {
10029 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010030 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 data_sub,
10032 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010033 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 }
Martin Pantere26da7c2016-06-02 10:07:09 +000010035 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 else {
10037 /* We do not need to compare 0 and len(substring)-1 because
10038 the if statement above ensured already that they are equal
10039 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 for (i = 1; i < end_sub; ++i) {
10041 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10042 PyUnicode_READ(kind_sub, data_sub, i))
10043 return 0;
10044 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010045 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047 }
10048
10049 return 0;
10050}
10051
Alexander Belopolsky40018472011-02-26 01:02:56 +000010052Py_ssize_t
10053PyUnicode_Tailmatch(PyObject *str,
10054 PyObject *substr,
10055 Py_ssize_t start,
10056 Py_ssize_t end,
10057 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010059 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010060 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010061
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010062 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063}
10064
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010065static PyObject *
10066ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010068 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010069 const char *data = PyUnicode_DATA(self);
10070 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010071 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010072
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010073 res = PyUnicode_New(len, 127);
10074 if (res == NULL)
10075 return NULL;
10076 resdata = PyUnicode_DATA(res);
10077 if (lower)
10078 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010080 _Py_bytes_upper(resdata, data, len);
10081 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082}
10083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010085handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010086{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010087 Py_ssize_t j;
10088 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010089 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010090 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010091
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010092 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10093
10094 where ! is a negation and \p{xxx} is a character with property xxx.
10095 */
10096 for (j = i - 1; j >= 0; j--) {
10097 c = PyUnicode_READ(kind, data, j);
10098 if (!_PyUnicode_IsCaseIgnorable(c))
10099 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010101 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10102 if (final_sigma) {
10103 for (j = i + 1; j < length; j++) {
10104 c = PyUnicode_READ(kind, data, j);
10105 if (!_PyUnicode_IsCaseIgnorable(c))
10106 break;
10107 }
10108 final_sigma = j == length || !_PyUnicode_IsCased(c);
10109 }
10110 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111}
10112
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010113static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010114lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010115 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010116{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010117 /* Obscure special case. */
10118 if (c == 0x3A3) {
10119 mapped[0] = handle_capital_sigma(kind, data, length, i);
10120 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010122 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123}
10124
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010125static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010126do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010128 Py_ssize_t i, k = 0;
10129 int n_res, j;
10130 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010131
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010132 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010133 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010134 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010135 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010136 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010138 for (i = 1; i < length; i++) {
10139 c = PyUnicode_READ(kind, data, i);
10140 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10141 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010142 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010143 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010144 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010145 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010146 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147}
10148
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010149static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010150do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010151 Py_ssize_t i, k = 0;
10152
10153 for (i = 0; i < length; i++) {
10154 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10155 int n_res, j;
10156 if (Py_UNICODE_ISUPPER(c)) {
10157 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10158 }
10159 else if (Py_UNICODE_ISLOWER(c)) {
10160 n_res = _PyUnicode_ToUpperFull(c, mapped);
10161 }
10162 else {
10163 n_res = 1;
10164 mapped[0] = c;
10165 }
10166 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010167 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010168 res[k++] = mapped[j];
10169 }
10170 }
10171 return k;
10172}
10173
10174static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010175do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010176 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010178 Py_ssize_t i, k = 0;
10179
10180 for (i = 0; i < length; i++) {
10181 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10182 int n_res, j;
10183 if (lower)
10184 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10185 else
10186 n_res = _PyUnicode_ToUpperFull(c, mapped);
10187 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010188 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010189 res[k++] = mapped[j];
10190 }
10191 }
10192 return k;
10193}
10194
10195static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010196do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010197{
10198 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10199}
10200
10201static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010202do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010203{
10204 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10205}
10206
Benjamin Petersone51757f2012-01-12 21:10:29 -050010207static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010208do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010209{
10210 Py_ssize_t i, k = 0;
10211
10212 for (i = 0; i < length; i++) {
10213 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10214 Py_UCS4 mapped[3];
10215 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10216 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010217 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010218 res[k++] = mapped[j];
10219 }
10220 }
10221 return k;
10222}
10223
10224static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010225do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010226{
10227 Py_ssize_t i, k = 0;
10228 int previous_is_cased;
10229
10230 previous_is_cased = 0;
10231 for (i = 0; i < length; i++) {
10232 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10233 Py_UCS4 mapped[3];
10234 int n_res, j;
10235
10236 if (previous_is_cased)
10237 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10238 else
10239 n_res = _PyUnicode_ToTitleFull(c, mapped);
10240
10241 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010242 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010243 res[k++] = mapped[j];
10244 }
10245
10246 previous_is_cased = _PyUnicode_IsCased(c);
10247 }
10248 return k;
10249}
10250
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010251static PyObject *
10252case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010253 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010254{
10255 PyObject *res = NULL;
10256 Py_ssize_t length, newlength = 0;
10257 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010258 const void *data;
10259 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010260 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10261
Benjamin Petersoneea48462012-01-16 14:28:50 -050010262 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010263
10264 kind = PyUnicode_KIND(self);
10265 data = PyUnicode_DATA(self);
10266 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010267 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010268 PyErr_SetString(PyExc_OverflowError, "string is too long");
10269 return NULL;
10270 }
Victor Stinner00d7abd2020-12-01 09:56:42 +010010271 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010272 if (tmp == NULL)
10273 return PyErr_NoMemory();
10274 newlength = perform(kind, data, length, tmp, &maxchar);
10275 res = PyUnicode_New(newlength, maxchar);
10276 if (res == NULL)
10277 goto leave;
10278 tmpend = tmp + newlength;
10279 outdata = PyUnicode_DATA(res);
10280 outkind = PyUnicode_KIND(res);
10281 switch (outkind) {
10282 case PyUnicode_1BYTE_KIND:
10283 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10284 break;
10285 case PyUnicode_2BYTE_KIND:
10286 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10287 break;
10288 case PyUnicode_4BYTE_KIND:
10289 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10290 break;
10291 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010292 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010293 }
10294 leave:
Victor Stinner00d7abd2020-12-01 09:56:42 +010010295 PyMem_Free(tmp);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010296 return res;
10297}
10298
Tim Peters8ce9f162004-08-27 01:49:32 +000010299PyObject *
10300PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010302 PyObject *res;
10303 PyObject *fseq;
10304 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010305 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010307 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010308 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010309 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010310 }
10311
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010312 /* NOTE: the following code can't call back into Python code,
10313 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010314 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010315
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010316 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010317 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010318 res = _PyUnicode_JoinArray(separator, items, seqlen);
10319 Py_DECREF(fseq);
10320 return res;
10321}
10322
10323PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010324_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010325{
10326 PyObject *res = NULL; /* the result */
10327 PyObject *sep = NULL;
10328 Py_ssize_t seplen;
10329 PyObject *item;
10330 Py_ssize_t sz, i, res_offset;
10331 Py_UCS4 maxchar;
10332 Py_UCS4 item_maxchar;
10333 int use_memcpy;
10334 unsigned char *res_data = NULL, *sep_data = NULL;
10335 PyObject *last_obj;
10336 unsigned int kind = 0;
10337
Tim Peters05eba1f2004-08-27 21:32:02 +000010338 /* If empty sequence, return u"". */
10339 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010340 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010341 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010342
Tim Peters05eba1f2004-08-27 21:32:02 +000010343 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010344 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010345 if (seqlen == 1) {
10346 if (PyUnicode_CheckExact(items[0])) {
10347 res = items[0];
10348 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010349 return res;
10350 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010351 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010352 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010353 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010354 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010355 /* Set up sep and seplen */
10356 if (separator == NULL) {
10357 /* fall back to a blank space separator */
10358 sep = PyUnicode_FromOrdinal(' ');
10359 if (!sep)
10360 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010361 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010362 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010363 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010364 else {
10365 if (!PyUnicode_Check(separator)) {
10366 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010367 "separator: expected str instance,"
10368 " %.80s found",
10369 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010370 goto onError;
10371 }
10372 if (PyUnicode_READY(separator))
10373 goto onError;
10374 sep = separator;
10375 seplen = PyUnicode_GET_LENGTH(separator);
10376 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10377 /* inc refcount to keep this code path symmetric with the
10378 above case of a blank separator */
10379 Py_INCREF(sep);
10380 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010381 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010382 }
10383
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010384 /* There are at least two things to join, or else we have a subclass
10385 * of str in the sequence.
10386 * Do a pre-pass to figure out the total amount of space we'll
10387 * need (sz), and see whether all argument are strings.
10388 */
10389 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010390#ifdef Py_DEBUG
10391 use_memcpy = 0;
10392#else
10393 use_memcpy = 1;
10394#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010395 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010396 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010397 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010398 if (!PyUnicode_Check(item)) {
10399 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010400 "sequence item %zd: expected str instance,"
10401 " %.80s found",
10402 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010403 goto onError;
10404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 if (PyUnicode_READY(item) == -1)
10406 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010407 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010409 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010410 if (i != 0) {
10411 add_sz += seplen;
10412 }
10413 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010414 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010415 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010416 goto onError;
10417 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010418 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010419 if (use_memcpy && last_obj != NULL) {
10420 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10421 use_memcpy = 0;
10422 }
10423 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010424 }
Tim Petersced69f82003-09-16 20:30:58 +000010425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010427 if (res == NULL)
10428 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010429
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010430 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010431#ifdef Py_DEBUG
10432 use_memcpy = 0;
10433#else
10434 if (use_memcpy) {
10435 res_data = PyUnicode_1BYTE_DATA(res);
10436 kind = PyUnicode_KIND(res);
10437 if (seplen != 0)
10438 sep_data = PyUnicode_1BYTE_DATA(sep);
10439 }
10440#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010441 if (use_memcpy) {
10442 for (i = 0; i < seqlen; ++i) {
10443 Py_ssize_t itemlen;
10444 item = items[i];
10445
10446 /* Copy item, and maybe the separator. */
10447 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010448 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010449 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010450 kind * seplen);
10451 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010452 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010453
10454 itemlen = PyUnicode_GET_LENGTH(item);
10455 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010456 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010457 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010458 kind * itemlen);
10459 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010460 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010461 }
10462 assert(res_data == PyUnicode_1BYTE_DATA(res)
10463 + kind * PyUnicode_GET_LENGTH(res));
10464 }
10465 else {
10466 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10467 Py_ssize_t itemlen;
10468 item = items[i];
10469
10470 /* Copy item, and maybe the separator. */
10471 if (i && seplen != 0) {
10472 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10473 res_offset += seplen;
10474 }
10475
10476 itemlen = PyUnicode_GET_LENGTH(item);
10477 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010478 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010479 res_offset += itemlen;
10480 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010481 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010482 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010483 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010486 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488
Benjamin Peterson29060642009-01-31 22:14:21 +000010489 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010491 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492 return NULL;
10493}
10494
Victor Stinnerd3f08822012-05-29 12:57:52 +020010495void
10496_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10497 Py_UCS4 fill_char)
10498{
10499 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010500 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010501 assert(PyUnicode_IS_READY(unicode));
10502 assert(unicode_modifiable(unicode));
10503 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10504 assert(start >= 0);
10505 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010506 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010507}
10508
Victor Stinner3fe55312012-01-04 00:33:50 +010010509Py_ssize_t
10510PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10511 Py_UCS4 fill_char)
10512{
10513 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010514
10515 if (!PyUnicode_Check(unicode)) {
10516 PyErr_BadInternalCall();
10517 return -1;
10518 }
10519 if (PyUnicode_READY(unicode) == -1)
10520 return -1;
10521 if (unicode_check_modifiable(unicode))
10522 return -1;
10523
Victor Stinnerd3f08822012-05-29 12:57:52 +020010524 if (start < 0) {
10525 PyErr_SetString(PyExc_IndexError, "string index out of range");
10526 return -1;
10527 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010528 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10529 PyErr_SetString(PyExc_ValueError,
10530 "fill character is bigger than "
10531 "the string maximum character");
10532 return -1;
10533 }
10534
10535 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10536 length = Py_MIN(maxlen, length);
10537 if (length <= 0)
10538 return 0;
10539
Victor Stinnerd3f08822012-05-29 12:57:52 +020010540 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010541 return length;
10542}
10543
Victor Stinner9310abb2011-10-05 00:59:23 +020010544static PyObject *
10545pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010546 Py_ssize_t left,
10547 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 PyObject *u;
10551 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010552 int kind;
10553 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554
10555 if (left < 0)
10556 left = 0;
10557 if (right < 0)
10558 right = 0;
10559
Victor Stinnerc4b49542011-12-11 22:44:26 +010010560 if (left == 0 && right == 0)
10561 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10564 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010565 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10566 return NULL;
10567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010569 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010571 if (!u)
10572 return NULL;
10573
10574 kind = PyUnicode_KIND(u);
10575 data = PyUnicode_DATA(u);
10576 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010577 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010578 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010579 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010580 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010581 assert(_PyUnicode_CheckConsistency(u, 1));
10582 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583}
10584
Alexander Belopolsky40018472011-02-26 01:02:56 +000010585PyObject *
10586PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010590 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592
Benjamin Petersonead6b532011-12-20 17:23:42 -060010593 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010595 if (PyUnicode_IS_ASCII(string))
10596 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010597 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010598 PyUnicode_GET_LENGTH(string), keepends);
10599 else
10600 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010601 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010602 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 break;
10604 case PyUnicode_2BYTE_KIND:
10605 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010606 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 PyUnicode_GET_LENGTH(string), keepends);
10608 break;
10609 case PyUnicode_4BYTE_KIND:
10610 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010611 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 PyUnicode_GET_LENGTH(string), keepends);
10613 break;
10614 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010615 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618}
10619
Alexander Belopolsky40018472011-02-26 01:02:56 +000010620static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010621split(PyObject *self,
10622 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010623 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010625 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010626 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 Py_ssize_t len1, len2;
10628 PyObject* out;
10629
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010631 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 if (PyUnicode_READY(self) == -1)
10634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010637 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010639 if (PyUnicode_IS_ASCII(self))
10640 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010641 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010642 PyUnicode_GET_LENGTH(self), maxcount
10643 );
10644 else
10645 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010646 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010647 PyUnicode_GET_LENGTH(self), maxcount
10648 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 case PyUnicode_2BYTE_KIND:
10650 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010651 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 PyUnicode_GET_LENGTH(self), maxcount
10653 );
10654 case PyUnicode_4BYTE_KIND:
10655 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010656 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 PyUnicode_GET_LENGTH(self), maxcount
10658 );
10659 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010660 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 }
10662
10663 if (PyUnicode_READY(substring) == -1)
10664 return NULL;
10665
10666 kind1 = PyUnicode_KIND(self);
10667 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 len1 = PyUnicode_GET_LENGTH(self);
10669 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010670 if (kind1 < kind2 || len1 < len2) {
10671 out = PyList_New(1);
10672 if (out == NULL)
10673 return NULL;
10674 Py_INCREF(self);
10675 PyList_SET_ITEM(out, 0, self);
10676 return out;
10677 }
10678 buf1 = PyUnicode_DATA(self);
10679 buf2 = PyUnicode_DATA(substring);
10680 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010681 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010682 if (!buf2)
10683 return NULL;
10684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010686 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010688 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10689 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010690 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010691 else
10692 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010693 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 break;
10695 case PyUnicode_2BYTE_KIND:
10696 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010697 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 break;
10699 case PyUnicode_4BYTE_KIND:
10700 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010701 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 break;
10703 default:
10704 out = NULL;
10705 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010706 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010707 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010708 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710}
10711
Alexander Belopolsky40018472011-02-26 01:02:56 +000010712static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010713rsplit(PyObject *self,
10714 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010715 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010716{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010717 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010718 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 Py_ssize_t len1, len2;
10720 PyObject* out;
10721
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010722 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010723 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 if (PyUnicode_READY(self) == -1)
10726 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010729 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010731 if (PyUnicode_IS_ASCII(self))
10732 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010733 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010734 PyUnicode_GET_LENGTH(self), maxcount
10735 );
10736 else
10737 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010738 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010739 PyUnicode_GET_LENGTH(self), maxcount
10740 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 case PyUnicode_2BYTE_KIND:
10742 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010743 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 PyUnicode_GET_LENGTH(self), maxcount
10745 );
10746 case PyUnicode_4BYTE_KIND:
10747 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010748 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 PyUnicode_GET_LENGTH(self), maxcount
10750 );
10751 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010752 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 }
10754
10755 if (PyUnicode_READY(substring) == -1)
10756 return NULL;
10757
10758 kind1 = PyUnicode_KIND(self);
10759 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 len1 = PyUnicode_GET_LENGTH(self);
10761 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010762 if (kind1 < kind2 || len1 < len2) {
10763 out = PyList_New(1);
10764 if (out == NULL)
10765 return NULL;
10766 Py_INCREF(self);
10767 PyList_SET_ITEM(out, 0, self);
10768 return out;
10769 }
10770 buf1 = PyUnicode_DATA(self);
10771 buf2 = PyUnicode_DATA(substring);
10772 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010773 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010774 if (!buf2)
10775 return NULL;
10776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010778 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010780 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10781 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010782 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010783 else
10784 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010785 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 break;
10787 case PyUnicode_2BYTE_KIND:
10788 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010789 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 break;
10791 case PyUnicode_4BYTE_KIND:
10792 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010793 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 break;
10795 default:
10796 out = NULL;
10797 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010798 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010799 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010800 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 return out;
10802}
10803
10804static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010805anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10806 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010808 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010810 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10811 return asciilib_find(buf1, len1, buf2, len2, offset);
10812 else
10813 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 case PyUnicode_2BYTE_KIND:
10815 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10816 case PyUnicode_4BYTE_KIND:
10817 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10818 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010819 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820}
10821
10822static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010823anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10824 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010826 switch (kind) {
10827 case PyUnicode_1BYTE_KIND:
10828 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10829 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10830 else
10831 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10832 case PyUnicode_2BYTE_KIND:
10833 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10834 case PyUnicode_4BYTE_KIND:
10835 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10836 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010837 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010838}
10839
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010840static void
10841replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10842 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10843{
10844 int kind = PyUnicode_KIND(u);
10845 void *data = PyUnicode_DATA(u);
10846 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10847 if (kind == PyUnicode_1BYTE_KIND) {
10848 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10849 (Py_UCS1 *)data + len,
10850 u1, u2, maxcount);
10851 }
10852 else if (kind == PyUnicode_2BYTE_KIND) {
10853 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10854 (Py_UCS2 *)data + len,
10855 u1, u2, maxcount);
10856 }
10857 else {
10858 assert(kind == PyUnicode_4BYTE_KIND);
10859 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10860 (Py_UCS4 *)data + len,
10861 u1, u2, maxcount);
10862 }
10863}
10864
Alexander Belopolsky40018472011-02-26 01:02:56 +000010865static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866replace(PyObject *self, PyObject *str1,
10867 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010870 const char *sbuf = PyUnicode_DATA(self);
10871 const void *buf1 = PyUnicode_DATA(str1);
10872 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 int srelease = 0, release1 = 0, release2 = 0;
10874 int skind = PyUnicode_KIND(self);
10875 int kind1 = PyUnicode_KIND(str1);
10876 int kind2 = PyUnicode_KIND(str2);
10877 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10878 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10879 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010880 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010881 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010883 if (slen < len1)
10884 goto nothing;
10885
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010887 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010888 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010889 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
Victor Stinner59de0ee2011-10-07 10:01:28 +020010891 if (str1 == str2)
10892 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893
Victor Stinner49a0a212011-10-12 23:46:10 +020010894 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010895 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10896 if (maxchar < maxchar_str1)
10897 /* substring too wide to be present */
10898 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010899 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10900 /* Replacing str1 with str2 may cause a maxchar reduction in the
10901 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010902 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010903 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010906 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010908 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010910 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010911 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010912 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010913
Victor Stinner69ed0f42013-04-09 21:48:24 +020010914 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010915 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010916 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010917 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010918 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010920 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010922
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010923 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10924 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010925 }
10926 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 int rkind = skind;
10928 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010929 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 if (kind1 < rkind) {
10932 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010933 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 if (!buf1) goto error;
10935 release1 = 1;
10936 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010937 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010938 if (i < 0)
10939 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 if (rkind > kind2) {
10941 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010942 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 if (!buf2) goto error;
10944 release2 = 1;
10945 }
10946 else if (rkind < kind2) {
10947 /* widen self and buf1 */
10948 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010949 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010950 assert(buf1 != PyUnicode_DATA(str1));
10951 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010952 buf1 = PyUnicode_DATA(str1);
10953 release1 = 0;
10954 }
10955 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 if (!sbuf) goto error;
10957 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010958 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 if (!buf1) goto error;
10960 release1 = 1;
10961 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010962 u = PyUnicode_New(slen, maxchar);
10963 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010965 assert(PyUnicode_KIND(u) == rkind);
10966 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010967
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010968 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010969 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010970 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010972 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010974
10975 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010976 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010977 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010978 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010979 if (i == -1)
10980 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010981 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010983 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010987 }
10988 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010990 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 int rkind = skind;
10992 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010995 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010996 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 if (!buf1) goto error;
10998 release1 = 1;
10999 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020011000 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011001 if (n == 0)
11002 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020011004 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011005 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 if (!buf2) goto error;
11007 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020011010 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011012 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 if (!sbuf) goto error;
11014 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011015 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011016 assert(buf1 != PyUnicode_DATA(str1));
11017 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011018 buf1 = PyUnicode_DATA(str1);
11019 release1 = 0;
11020 }
11021 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 if (!buf1) goto error;
11023 release1 = 1;
11024 }
11025 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
11026 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011027 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 PyErr_SetString(PyExc_OverflowError,
11029 "replace string is too long");
11030 goto error;
11031 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010011032 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020011033 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020011034 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020011035 goto done;
11036 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080011037 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 PyErr_SetString(PyExc_OverflowError,
11039 "replace string is too long");
11040 goto error;
11041 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011042 u = PyUnicode_New(new_size, maxchar);
11043 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020011045 assert(PyUnicode_KIND(u) == rkind);
11046 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 ires = i = 0;
11048 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011049 while (n-- > 0) {
11050 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020011051 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011052 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020011053 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011054 if (j == -1)
11055 break;
11056 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011057 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011058 memcpy(res + rkind * ires,
11059 sbuf + rkind * i,
11060 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011062 }
11063 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011065 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011067 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011073 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011074 memcpy(res + rkind * ires,
11075 sbuf + rkind * i,
11076 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011077 }
11078 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011079 /* interleave */
11080 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011081 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011083 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011085 if (--n <= 0)
11086 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011087 memcpy(res + rkind * ires,
11088 sbuf + rkind * i,
11089 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 ires++;
11091 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011092 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011093 memcpy(res + rkind * ires,
11094 sbuf + rkind * i,
11095 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011096 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011097 }
11098
11099 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011100 unicode_adjust_maxchar(&u);
11101 if (u == NULL)
11102 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011104
11105 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011106 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11107 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11108 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011110 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011112 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011114 PyMem_Free((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011115 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117
Benjamin Peterson29060642009-01-31 22:14:21 +000011118 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011119 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011120 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11121 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11122 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011124 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011126 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011128 PyMem_Free((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011129 return unicode_result_unchanged(self);
11130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011132 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11133 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11134 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11135 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011136 PyMem_Free((void *)sbuf);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011137 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011138 PyMem_Free((void *)buf1);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011139 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011140 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142}
11143
11144/* --- Unicode Object Methods --------------------------------------------- */
11145
INADA Naoki3ae20562017-01-16 20:41:20 +090011146/*[clinic input]
11147str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148
INADA Naoki3ae20562017-01-16 20:41:20 +090011149Return a version of the string where each word is titlecased.
11150
11151More specifically, words start with uppercased characters and all remaining
11152cased characters have lower case.
11153[clinic start generated code]*/
11154
11155static PyObject *
11156unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011157/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011159 if (PyUnicode_READY(self) == -1)
11160 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011161 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162}
11163
INADA Naoki3ae20562017-01-16 20:41:20 +090011164/*[clinic input]
11165str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166
INADA Naoki3ae20562017-01-16 20:41:20 +090011167Return a capitalized version of the string.
11168
11169More specifically, make the first character have upper case and the rest lower
11170case.
11171[clinic start generated code]*/
11172
11173static PyObject *
11174unicode_capitalize_impl(PyObject *self)
11175/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011177 if (PyUnicode_READY(self) == -1)
11178 return NULL;
11179 if (PyUnicode_GET_LENGTH(self) == 0)
11180 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011181 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182}
11183
INADA Naoki3ae20562017-01-16 20:41:20 +090011184/*[clinic input]
11185str.casefold as unicode_casefold
11186
11187Return a version of the string suitable for caseless comparisons.
11188[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011189
11190static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011191unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011192/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011193{
11194 if (PyUnicode_READY(self) == -1)
11195 return NULL;
11196 if (PyUnicode_IS_ASCII(self))
11197 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011198 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011199}
11200
11201
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011202/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011203
11204static int
11205convert_uc(PyObject *obj, void *addr)
11206{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011208
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011209 if (!PyUnicode_Check(obj)) {
11210 PyErr_Format(PyExc_TypeError,
11211 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011212 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011213 return 0;
11214 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011215 if (PyUnicode_READY(obj) < 0)
11216 return 0;
11217 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011218 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011219 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011220 return 0;
11221 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011222 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011223 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011224}
11225
INADA Naoki3ae20562017-01-16 20:41:20 +090011226/*[clinic input]
11227str.center as unicode_center
11228
11229 width: Py_ssize_t
11230 fillchar: Py_UCS4 = ' '
11231 /
11232
11233Return a centered string of length width.
11234
11235Padding is done using the specified fill character (default is a space).
11236[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
11238static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011239unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11240/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011242 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
Benjamin Petersonbac79492012-01-14 13:34:47 -050011244 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245 return NULL;
11246
Victor Stinnerc4b49542011-12-11 22:44:26 +010011247 if (PyUnicode_GET_LENGTH(self) >= width)
11248 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249
Victor Stinnerc4b49542011-12-11 22:44:26 +010011250 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251 left = marg / 2 + (marg & width & 1);
11252
Victor Stinner9310abb2011-10-05 00:59:23 +020011253 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254}
11255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256/* This function assumes that str1 and str2 are readied by the caller. */
11257
Marc-André Lemburge5034372000-08-08 08:04:29 +000011258static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011259unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011260{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011261#define COMPARE(TYPE1, TYPE2) \
11262 do { \
11263 TYPE1* p1 = (TYPE1 *)data1; \
11264 TYPE2* p2 = (TYPE2 *)data2; \
11265 TYPE1* end = p1 + len; \
11266 Py_UCS4 c1, c2; \
11267 for (; p1 != end; p1++, p2++) { \
11268 c1 = *p1; \
11269 c2 = *p2; \
11270 if (c1 != c2) \
11271 return (c1 < c2) ? -1 : 1; \
11272 } \
11273 } \
11274 while (0)
11275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011277 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011278 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 kind1 = PyUnicode_KIND(str1);
11281 kind2 = PyUnicode_KIND(str2);
11282 data1 = PyUnicode_DATA(str1);
11283 data2 = PyUnicode_DATA(str2);
11284 len1 = PyUnicode_GET_LENGTH(str1);
11285 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011286 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011287
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011288 switch(kind1) {
11289 case PyUnicode_1BYTE_KIND:
11290 {
11291 switch(kind2) {
11292 case PyUnicode_1BYTE_KIND:
11293 {
11294 int cmp = memcmp(data1, data2, len);
11295 /* normalize result of memcmp() into the range [-1; 1] */
11296 if (cmp < 0)
11297 return -1;
11298 if (cmp > 0)
11299 return 1;
11300 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011301 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011302 case PyUnicode_2BYTE_KIND:
11303 COMPARE(Py_UCS1, Py_UCS2);
11304 break;
11305 case PyUnicode_4BYTE_KIND:
11306 COMPARE(Py_UCS1, Py_UCS4);
11307 break;
11308 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011309 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011310 }
11311 break;
11312 }
11313 case PyUnicode_2BYTE_KIND:
11314 {
11315 switch(kind2) {
11316 case PyUnicode_1BYTE_KIND:
11317 COMPARE(Py_UCS2, Py_UCS1);
11318 break;
11319 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011320 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011321 COMPARE(Py_UCS2, Py_UCS2);
11322 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011323 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011324 case PyUnicode_4BYTE_KIND:
11325 COMPARE(Py_UCS2, Py_UCS4);
11326 break;
11327 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011328 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011329 }
11330 break;
11331 }
11332 case PyUnicode_4BYTE_KIND:
11333 {
11334 switch(kind2) {
11335 case PyUnicode_1BYTE_KIND:
11336 COMPARE(Py_UCS4, Py_UCS1);
11337 break;
11338 case PyUnicode_2BYTE_KIND:
11339 COMPARE(Py_UCS4, Py_UCS2);
11340 break;
11341 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011342 {
11343#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11344 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11345 /* normalize result of wmemcmp() into the range [-1; 1] */
11346 if (cmp < 0)
11347 return -1;
11348 if (cmp > 0)
11349 return 1;
11350#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011351 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011352#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011353 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011354 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011355 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011356 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011357 }
11358 break;
11359 }
11360 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011361 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011362 }
11363
Victor Stinner770e19e2012-10-04 22:59:45 +020011364 if (len1 == len2)
11365 return 0;
11366 if (len1 < len2)
11367 return -1;
11368 else
11369 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011370
11371#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011372}
11373
Benjamin Peterson621b4302016-09-09 13:54:34 -070011374static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011375unicode_compare_eq(PyObject *str1, PyObject *str2)
11376{
11377 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011378 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011379 Py_ssize_t len;
11380 int cmp;
11381
Victor Stinnere5567ad2012-10-23 02:48:49 +020011382 len = PyUnicode_GET_LENGTH(str1);
11383 if (PyUnicode_GET_LENGTH(str2) != len)
11384 return 0;
11385 kind = PyUnicode_KIND(str1);
11386 if (PyUnicode_KIND(str2) != kind)
11387 return 0;
11388 data1 = PyUnicode_DATA(str1);
11389 data2 = PyUnicode_DATA(str2);
11390
11391 cmp = memcmp(data1, data2, len * kind);
11392 return (cmp == 0);
11393}
11394
11395
Alexander Belopolsky40018472011-02-26 01:02:56 +000011396int
11397PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11400 if (PyUnicode_READY(left) == -1 ||
11401 PyUnicode_READY(right) == -1)
11402 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011403
11404 /* a string is equal to itself */
11405 if (left == right)
11406 return 0;
11407
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011408 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011410 PyErr_Format(PyExc_TypeError,
11411 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011412 Py_TYPE(left)->tp_name,
11413 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414 return -1;
11415}
11416
Martin v. Löwis5b222132007-06-10 09:51:05 +000011417int
11418PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 Py_ssize_t i;
11421 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011423 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424
Victor Stinner910337b2011-10-03 03:20:16 +020011425 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011426 if (!PyUnicode_IS_READY(uni)) {
11427 const wchar_t *ws = _PyUnicode_WSTR(uni);
11428 /* Compare Unicode string and source character set string */
11429 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11430 if (chr != ustr[i])
11431 return (chr < ustr[i]) ? -1 : 1;
11432 }
11433 /* This check keeps Python strings that end in '\0' from comparing equal
11434 to C strings identical up to that point. */
11435 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11436 return 1; /* uni is longer */
11437 if (ustr[i])
11438 return -1; /* str is longer */
11439 return 0;
11440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011442 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011443 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011444 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011445 size_t len, len2 = strlen(str);
11446 int cmp;
11447
11448 len = Py_MIN(len1, len2);
11449 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011450 if (cmp != 0) {
11451 if (cmp < 0)
11452 return -1;
11453 else
11454 return 1;
11455 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011456 if (len1 > len2)
11457 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011458 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011459 return -1; /* str is longer */
11460 return 0;
11461 }
11462 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011463 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011464 /* Compare Unicode string and source character set string */
11465 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011466 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011467 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11468 /* This check keeps Python strings that end in '\0' from comparing equal
11469 to C strings identical up to that point. */
11470 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11471 return 1; /* uni is longer */
11472 if (str[i])
11473 return -1; /* str is longer */
11474 return 0;
11475 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011476}
11477
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011478static int
11479non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11480{
11481 size_t i, len;
11482 const wchar_t *p;
11483 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11484 if (strlen(str) != len)
11485 return 0;
11486 p = _PyUnicode_WSTR(unicode);
11487 assert(p);
11488 for (i = 0; i < len; i++) {
11489 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011490 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011491 return 0;
11492 }
11493 return 1;
11494}
11495
11496int
11497_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11498{
11499 size_t len;
11500 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011501 assert(str);
11502#ifndef NDEBUG
11503 for (const char *p = str; *p; p++) {
11504 assert((unsigned char)*p < 128);
11505 }
11506#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011507 if (PyUnicode_READY(unicode) == -1) {
11508 /* Memory error or bad data */
11509 PyErr_Clear();
11510 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11511 }
11512 if (!PyUnicode_IS_ASCII(unicode))
11513 return 0;
11514 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11515 return strlen(str) == len &&
11516 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11517}
11518
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011519int
11520_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11521{
11522 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011523
11524 assert(_PyUnicode_CHECK(left));
11525 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011526#ifndef NDEBUG
11527 for (const char *p = right->string; *p; p++) {
11528 assert((unsigned char)*p < 128);
11529 }
11530#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011531
11532 if (PyUnicode_READY(left) == -1) {
11533 /* memory error or bad data */
11534 PyErr_Clear();
11535 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11536 }
11537
11538 if (!PyUnicode_IS_ASCII(left))
11539 return 0;
11540
11541 right_uni = _PyUnicode_FromId(right); /* borrowed */
11542 if (right_uni == NULL) {
11543 /* memory error or bad data */
11544 PyErr_Clear();
11545 return _PyUnicode_EqualToASCIIString(left, right->string);
11546 }
11547
11548 if (left == right_uni)
11549 return 1;
11550
11551 if (PyUnicode_CHECK_INTERNED(left))
11552 return 0;
11553
INADA Naoki7cc95f52018-01-28 02:07:09 +090011554 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011555 Py_hash_t hash = _PyUnicode_HASH(left);
Victor Stinnerea251802020-12-26 02:58:33 +010011556 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011557 return 0;
Victor Stinnerea251802020-12-26 02:58:33 +010011558 }
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011559
11560 return unicode_compare_eq(left, right_uni);
11561}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011562
Alexander Belopolsky40018472011-02-26 01:02:56 +000011563PyObject *
11564PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011565{
11566 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011567
Victor Stinnere5567ad2012-10-23 02:48:49 +020011568 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11569 Py_RETURN_NOTIMPLEMENTED;
11570
11571 if (PyUnicode_READY(left) == -1 ||
11572 PyUnicode_READY(right) == -1)
11573 return NULL;
11574
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011575 if (left == right) {
11576 switch (op) {
11577 case Py_EQ:
11578 case Py_LE:
11579 case Py_GE:
11580 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011581 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011582 case Py_NE:
11583 case Py_LT:
11584 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011585 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011586 default:
11587 PyErr_BadArgument();
11588 return NULL;
11589 }
11590 }
11591 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011592 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011593 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011594 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011595 }
11596 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011597 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011598 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011599 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011600}
11601
Alexander Belopolsky40018472011-02-26 01:02:56 +000011602int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011603_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11604{
11605 return unicode_eq(aa, bb);
11606}
11607
11608int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011609PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011610{
Victor Stinner77282cb2013-04-14 19:22:47 +020011611 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011612 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011614 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011615
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011616 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011618 "'in <string>' requires string as left operand, not %.100s",
11619 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011621 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011622 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011623 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011624 if (ensure_unicode(str) < 0)
11625 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011628 kind2 = PyUnicode_KIND(substr);
11629 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011630 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011632 len2 = PyUnicode_GET_LENGTH(substr);
11633 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011634 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011635 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011636 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011637 if (len2 == 1) {
11638 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11639 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011640 return result;
11641 }
11642 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011643 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011644 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011645 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647
Victor Stinner77282cb2013-04-14 19:22:47 +020011648 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 case PyUnicode_1BYTE_KIND:
11650 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11651 break;
11652 case PyUnicode_2BYTE_KIND:
11653 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11654 break;
11655 case PyUnicode_4BYTE_KIND:
11656 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11657 break;
11658 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011659 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011661
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011662 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011663 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011664 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665
Guido van Rossum403d68b2000-03-13 15:55:09 +000011666 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011667}
11668
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669/* Concat to string or Unicode object giving a new Unicode object. */
11670
Alexander Belopolsky40018472011-02-26 01:02:56 +000011671PyObject *
11672PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011674 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011675 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011676 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011678 if (ensure_unicode(left) < 0)
11679 return NULL;
11680
11681 if (!PyUnicode_Check(right)) {
11682 PyErr_Format(PyExc_TypeError,
11683 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011684 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011685 return NULL;
11686 }
11687 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011688 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
11690 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011691 PyObject *empty = unicode_get_empty(); // Borrowed reference
11692 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011693 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011694 }
11695 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011696 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011699 left_len = PyUnicode_GET_LENGTH(left);
11700 right_len = PyUnicode_GET_LENGTH(right);
11701 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011702 PyErr_SetString(PyExc_OverflowError,
11703 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011704 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011705 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011706 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011707
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011708 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11709 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011710 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011713 result = PyUnicode_New(new_len, maxchar);
11714 if (result == NULL)
11715 return NULL;
11716 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11717 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11718 assert(_PyUnicode_CheckConsistency(result, 1));
11719 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720}
11721
Walter Dörwald1ab83302007-05-18 17:15:44 +000011722void
Victor Stinner23e56682011-10-03 03:54:37 +020011723PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011724{
Victor Stinner23e56682011-10-03 03:54:37 +020011725 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011726 Py_UCS4 maxchar, maxchar2;
11727 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011728
11729 if (p_left == NULL) {
11730 if (!PyErr_Occurred())
11731 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011732 return;
11733 }
Victor Stinner23e56682011-10-03 03:54:37 +020011734 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011735 if (right == NULL || left == NULL
11736 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011737 if (!PyErr_Occurred())
11738 PyErr_BadInternalCall();
11739 goto error;
11740 }
11741
Benjamin Petersonbac79492012-01-14 13:34:47 -050011742 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011743 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011744 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011745 goto error;
11746
Victor Stinner488fa492011-12-12 00:01:39 +010011747 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011748 PyObject *empty = unicode_get_empty(); // Borrowed reference
11749 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011750 Py_DECREF(left);
11751 Py_INCREF(right);
11752 *p_left = right;
11753 return;
11754 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011755 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011756 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011757 }
Victor Stinner488fa492011-12-12 00:01:39 +010011758
11759 left_len = PyUnicode_GET_LENGTH(left);
11760 right_len = PyUnicode_GET_LENGTH(right);
11761 if (left_len > PY_SSIZE_T_MAX - right_len) {
11762 PyErr_SetString(PyExc_OverflowError,
11763 "strings are too large to concat");
11764 goto error;
11765 }
11766 new_len = left_len + right_len;
11767
11768 if (unicode_modifiable(left)
11769 && PyUnicode_CheckExact(right)
11770 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011771 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11772 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011773 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011774 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011775 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11776 {
11777 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011778 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011779 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011780
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011781 /* copy 'right' into the newly allocated area of 'left' */
11782 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011783 }
Victor Stinner488fa492011-12-12 00:01:39 +010011784 else {
11785 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11786 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011787 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011788
Victor Stinner488fa492011-12-12 00:01:39 +010011789 /* Concat the two Unicode strings */
11790 res = PyUnicode_New(new_len, maxchar);
11791 if (res == NULL)
11792 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011793 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11794 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011795 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011796 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011797 }
11798 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011799 return;
11800
11801error:
Victor Stinner488fa492011-12-12 00:01:39 +010011802 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011803}
11804
11805void
11806PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11807{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011808 PyUnicode_Append(pleft, right);
11809 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011810}
11811
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011812/*
11813Wraps stringlib_parse_args_finds() and additionally ensures that the
11814first argument is a unicode object.
11815*/
11816
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011817static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011818parse_args_finds_unicode(const char * function_name, PyObject *args,
11819 PyObject **substring,
11820 Py_ssize_t *start, Py_ssize_t *end)
11821{
11822 if(stringlib_parse_args_finds(function_name, args, substring,
11823 start, end)) {
11824 if (ensure_unicode(*substring) < 0)
11825 return 0;
11826 return 1;
11827 }
11828 return 0;
11829}
11830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011831PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011834Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011835string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011836interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837
11838static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011839unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011841 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011842 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011843 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011845 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011846 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011849 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011850 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 kind1 = PyUnicode_KIND(self);
11853 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011854 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011855 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 len1 = PyUnicode_GET_LENGTH(self);
11858 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011860 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011861 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011862
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011863 buf1 = PyUnicode_DATA(self);
11864 buf2 = PyUnicode_DATA(substring);
11865 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011866 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011867 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011868 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011869 }
11870 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 case PyUnicode_1BYTE_KIND:
11872 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011873 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 buf2, len2, PY_SSIZE_T_MAX
11875 );
11876 break;
11877 case PyUnicode_2BYTE_KIND:
11878 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011879 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 buf2, len2, PY_SSIZE_T_MAX
11881 );
11882 break;
11883 case PyUnicode_4BYTE_KIND:
11884 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011885 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 buf2, len2, PY_SSIZE_T_MAX
11887 );
11888 break;
11889 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011890 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 }
11892
11893 result = PyLong_FromSsize_t(iresult);
11894
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011895 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011896 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011897 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899 return result;
11900}
11901
INADA Naoki3ae20562017-01-16 20:41:20 +090011902/*[clinic input]
11903str.encode as unicode_encode
11904
11905 encoding: str(c_default="NULL") = 'utf-8'
11906 The encoding in which to encode the string.
11907 errors: str(c_default="NULL") = 'strict'
11908 The error handling scheme to use for encoding errors.
11909 The default is 'strict' meaning that encoding errors raise a
11910 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11911 'xmlcharrefreplace' as well as any other name registered with
11912 codecs.register_error that can handle UnicodeEncodeErrors.
11913
11914Encode the string using the codec registered for encoding.
11915[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916
11917static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011918unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011919/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011921 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011922}
11923
INADA Naoki3ae20562017-01-16 20:41:20 +090011924/*[clinic input]
11925str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
INADA Naoki3ae20562017-01-16 20:41:20 +090011927 tabsize: int = 8
11928
11929Return a copy where all tab characters are expanded using spaces.
11930
11931If tabsize is not given, a tab size of 8 characters is assumed.
11932[clinic start generated code]*/
11933
11934static PyObject *
11935unicode_expandtabs_impl(PyObject *self, int tabsize)
11936/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011938 Py_ssize_t i, j, line_pos, src_len, incr;
11939 Py_UCS4 ch;
11940 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011941 const void *src_data;
11942 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011943 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011944 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945
Antoine Pitrou22425222011-10-04 19:10:51 +020011946 if (PyUnicode_READY(self) == -1)
11947 return NULL;
11948
Thomas Wouters7e474022000-07-16 12:04:32 +000011949 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011950 src_len = PyUnicode_GET_LENGTH(self);
11951 i = j = line_pos = 0;
11952 kind = PyUnicode_KIND(self);
11953 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011954 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011955 for (; i < src_len; i++) {
11956 ch = PyUnicode_READ(kind, src_data, i);
11957 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011958 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011960 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011962 goto overflow;
11963 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011965 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011968 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011969 goto overflow;
11970 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011972 if (ch == '\n' || ch == '\r')
11973 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011975 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011976 if (!found)
11977 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011978
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011980 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981 if (!u)
11982 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011983 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984
Antoine Pitroue71d5742011-10-04 15:55:09 +020011985 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986
Antoine Pitroue71d5742011-10-04 15:55:09 +020011987 for (; i < src_len; i++) {
11988 ch = PyUnicode_READ(kind, src_data, i);
11989 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011991 incr = tabsize - (line_pos % tabsize);
11992 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011993 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011994 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011996 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011998 line_pos++;
11999 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012000 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020012001 if (ch == '\n' || ch == '\r')
12002 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020012004 }
12005 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010012006 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012007
Antoine Pitroue71d5742011-10-04 15:55:09 +020012008 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012009 PyErr_SetString(PyExc_OverflowError, "new string is too long");
12010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011}
12012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012013PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015\n\
12016Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012017such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018arguments start and end are interpreted as in slice notation.\n\
12019\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012020Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021
12022static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012025 /* initialize variables to prevent gcc warning */
12026 PyObject *substring = NULL;
12027 Py_ssize_t start = 0;
12028 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012029 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012031 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012034 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012037 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 if (result == -2)
12040 return NULL;
12041
Christian Heimes217cfd12007-12-02 14:31:20 +000012042 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043}
12044
12045static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012046unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012048 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012049 enum PyUnicode_Kind kind;
12050 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012051
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012052 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012053 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012055 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012056 if (PyUnicode_READY(self) == -1) {
12057 return NULL;
12058 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012059 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12060 PyErr_SetString(PyExc_IndexError, "string index out of range");
12061 return NULL;
12062 }
12063 kind = PyUnicode_KIND(self);
12064 data = PyUnicode_DATA(self);
12065 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012066 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067}
12068
Guido van Rossumc2504932007-09-18 19:42:40 +000012069/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012070 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012071static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012072unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012074 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012075
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012076#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012077 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012078#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 if (_PyUnicode_HASH(self) != -1)
12080 return _PyUnicode_HASH(self);
12081 if (PyUnicode_READY(self) == -1)
12082 return -1;
animalizea1d14252019-01-02 20:16:06 +080012083
Christian Heimes985ecdc2013-11-20 11:46:18 +010012084 x = _Py_HashBytes(PyUnicode_DATA(self),
12085 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012087 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088}
12089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012090PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092\n\
oldkaa0735f2018-02-02 16:52:55 +080012093Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012094such that sub is contained within S[start:end]. Optional\n\
12095arguments start and end are interpreted as in slice notation.\n\
12096\n\
12097Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
12099static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012102 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012103 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012104 PyObject *substring = NULL;
12105 Py_ssize_t start = 0;
12106 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012108 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012111 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012114 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 if (result == -2)
12117 return NULL;
12118
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119 if (result < 0) {
12120 PyErr_SetString(PyExc_ValueError, "substring not found");
12121 return NULL;
12122 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012123
Christian Heimes217cfd12007-12-02 14:31:20 +000012124 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125}
12126
INADA Naoki3ae20562017-01-16 20:41:20 +090012127/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012128str.isascii as unicode_isascii
12129
12130Return True if all characters in the string are ASCII, False otherwise.
12131
12132ASCII characters have code points in the range U+0000-U+007F.
12133Empty string is ASCII too.
12134[clinic start generated code]*/
12135
12136static PyObject *
12137unicode_isascii_impl(PyObject *self)
12138/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12139{
12140 if (PyUnicode_READY(self) == -1) {
12141 return NULL;
12142 }
12143 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12144}
12145
12146/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012147str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148
INADA Naoki3ae20562017-01-16 20:41:20 +090012149Return True if the string is a lowercase string, False otherwise.
12150
12151A string is lowercase if all cased characters in the string are lowercase and
12152there is at least one cased character in the string.
12153[clinic start generated code]*/
12154
12155static PyObject *
12156unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012157/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 Py_ssize_t i, length;
12160 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012161 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162 int cased;
12163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 if (PyUnicode_READY(self) == -1)
12165 return NULL;
12166 length = PyUnicode_GET_LENGTH(self);
12167 kind = PyUnicode_KIND(self);
12168 data = PyUnicode_DATA(self);
12169
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 if (length == 1)
12172 return PyBool_FromLong(
12173 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012175 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012177 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012178
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 for (i = 0; i < length; i++) {
12181 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012182
Benjamin Peterson29060642009-01-31 22:14:21 +000012183 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012184 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 else if (!cased && Py_UNICODE_ISLOWER(ch))
12186 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012188 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189}
12190
INADA Naoki3ae20562017-01-16 20:41:20 +090012191/*[clinic input]
12192str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193
INADA Naoki3ae20562017-01-16 20:41:20 +090012194Return True if the string is an uppercase string, False otherwise.
12195
12196A string is uppercase if all cased characters in the string are uppercase and
12197there is at least one cased character in the string.
12198[clinic start generated code]*/
12199
12200static PyObject *
12201unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012202/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 Py_ssize_t i, length;
12205 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012206 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207 int cased;
12208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 if (PyUnicode_READY(self) == -1)
12210 return NULL;
12211 length = PyUnicode_GET_LENGTH(self);
12212 kind = PyUnicode_KIND(self);
12213 data = PyUnicode_DATA(self);
12214
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 if (length == 1)
12217 return PyBool_FromLong(
12218 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012220 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012222 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012223
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 for (i = 0; i < length; i++) {
12226 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012227
Benjamin Peterson29060642009-01-31 22:14:21 +000012228 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012229 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 else if (!cased && Py_UNICODE_ISUPPER(ch))
12231 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012233 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234}
12235
INADA Naoki3ae20562017-01-16 20:41:20 +090012236/*[clinic input]
12237str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238
INADA Naoki3ae20562017-01-16 20:41:20 +090012239Return True if the string is a title-cased string, False otherwise.
12240
12241In a title-cased string, upper- and title-case characters may only
12242follow uncased characters and lowercase characters only cased ones.
12243[clinic start generated code]*/
12244
12245static PyObject *
12246unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012247/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 Py_ssize_t i, length;
12250 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012251 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252 int cased, previous_is_cased;
12253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 if (PyUnicode_READY(self) == -1)
12255 return NULL;
12256 length = PyUnicode_GET_LENGTH(self);
12257 kind = PyUnicode_KIND(self);
12258 data = PyUnicode_DATA(self);
12259
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 if (length == 1) {
12262 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12263 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12264 (Py_UNICODE_ISUPPER(ch) != 0));
12265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012267 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012269 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012270
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 cased = 0;
12272 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 for (i = 0; i < length; i++) {
12274 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012275
Benjamin Peterson29060642009-01-31 22:14:21 +000012276 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12277 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012278 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 previous_is_cased = 1;
12280 cased = 1;
12281 }
12282 else if (Py_UNICODE_ISLOWER(ch)) {
12283 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012284 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 previous_is_cased = 1;
12286 cased = 1;
12287 }
12288 else
12289 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012291 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292}
12293
INADA Naoki3ae20562017-01-16 20:41:20 +090012294/*[clinic input]
12295str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
INADA Naoki3ae20562017-01-16 20:41:20 +090012297Return True if the string is a whitespace string, False otherwise.
12298
12299A string is whitespace if all characters in the string are whitespace and there
12300is at least one character in the string.
12301[clinic start generated code]*/
12302
12303static PyObject *
12304unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012305/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 Py_ssize_t i, length;
12308 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012309 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310
12311 if (PyUnicode_READY(self) == -1)
12312 return NULL;
12313 length = PyUnicode_GET_LENGTH(self);
12314 kind = PyUnicode_KIND(self);
12315 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 if (length == 1)
12319 return PyBool_FromLong(
12320 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012322 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012324 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 for (i = 0; i < length; i++) {
12327 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012328 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012329 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012331 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332}
12333
INADA Naoki3ae20562017-01-16 20:41:20 +090012334/*[clinic input]
12335str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012336
INADA Naoki3ae20562017-01-16 20:41:20 +090012337Return True if the string is an alphabetic string, False otherwise.
12338
12339A string is alphabetic if all characters in the string are alphabetic and there
12340is at least one character in the string.
12341[clinic start generated code]*/
12342
12343static PyObject *
12344unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012345/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012346{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 Py_ssize_t i, length;
12348 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012349 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350
12351 if (PyUnicode_READY(self) == -1)
12352 return NULL;
12353 length = PyUnicode_GET_LENGTH(self);
12354 kind = PyUnicode_KIND(self);
12355 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012356
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012357 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 if (length == 1)
12359 return PyBool_FromLong(
12360 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012361
12362 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012364 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 for (i = 0; i < length; i++) {
12367 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012368 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012369 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012370 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012371}
12372
INADA Naoki3ae20562017-01-16 20:41:20 +090012373/*[clinic input]
12374str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012375
INADA Naoki3ae20562017-01-16 20:41:20 +090012376Return True if the string is an alpha-numeric string, False otherwise.
12377
12378A string is alpha-numeric if all characters in the string are alpha-numeric and
12379there is at least one character in the string.
12380[clinic start generated code]*/
12381
12382static PyObject *
12383unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012384/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012385{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012387 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 Py_ssize_t len, i;
12389
12390 if (PyUnicode_READY(self) == -1)
12391 return NULL;
12392
12393 kind = PyUnicode_KIND(self);
12394 data = PyUnicode_DATA(self);
12395 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012396
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012397 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 if (len == 1) {
12399 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12400 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12401 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012402
12403 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012405 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 for (i = 0; i < len; i++) {
12408 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012409 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012410 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012411 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012412 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012413}
12414
INADA Naoki3ae20562017-01-16 20:41:20 +090012415/*[clinic input]
12416str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417
INADA Naoki3ae20562017-01-16 20:41:20 +090012418Return True if the string is a decimal string, False otherwise.
12419
12420A string is a decimal string if all characters in the string are decimal and
12421there is at least one character in the string.
12422[clinic start generated code]*/
12423
12424static PyObject *
12425unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012426/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 Py_ssize_t i, length;
12429 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012430 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431
12432 if (PyUnicode_READY(self) == -1)
12433 return NULL;
12434 length = PyUnicode_GET_LENGTH(self);
12435 kind = PyUnicode_KIND(self);
12436 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 if (length == 1)
12440 return PyBool_FromLong(
12441 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012443 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012445 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 for (i = 0; i < length; i++) {
12448 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012449 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012451 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452}
12453
INADA Naoki3ae20562017-01-16 20:41:20 +090012454/*[clinic input]
12455str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456
INADA Naoki3ae20562017-01-16 20:41:20 +090012457Return True if the string is a digit string, False otherwise.
12458
12459A string is a digit string if all characters in the string are digits and there
12460is at least one character in the string.
12461[clinic start generated code]*/
12462
12463static PyObject *
12464unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012465/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 Py_ssize_t i, length;
12468 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012469 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470
12471 if (PyUnicode_READY(self) == -1)
12472 return NULL;
12473 length = PyUnicode_GET_LENGTH(self);
12474 kind = PyUnicode_KIND(self);
12475 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 if (length == 1) {
12479 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12480 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012483 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012484 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012485 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 for (i = 0; i < length; i++) {
12488 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012489 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012491 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492}
12493
INADA Naoki3ae20562017-01-16 20:41:20 +090012494/*[clinic input]
12495str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496
INADA Naoki3ae20562017-01-16 20:41:20 +090012497Return True if the string is a numeric string, False otherwise.
12498
12499A string is numeric if all characters in the string are numeric and there is at
12500least one character in the string.
12501[clinic start generated code]*/
12502
12503static PyObject *
12504unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012505/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 Py_ssize_t i, length;
12508 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012509 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510
12511 if (PyUnicode_READY(self) == -1)
12512 return NULL;
12513 length = PyUnicode_GET_LENGTH(self);
12514 kind = PyUnicode_KIND(self);
12515 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 if (length == 1)
12519 return PyBool_FromLong(
12520 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012522 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012524 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 for (i = 0; i < length; i++) {
12527 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012528 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012530 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531}
12532
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012533Py_ssize_t
12534_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012535{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012537 if (PyUnicode_READY(self) == -1)
12538 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012539
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012540 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012541 if (len == 0) {
12542 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012543 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 }
12545
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012546 int kind = PyUnicode_KIND(self);
12547 const void *data = PyUnicode_DATA(self);
12548 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012549 /* PEP 3131 says that the first character must be in
12550 XID_Start and subsequent characters in XID_Continue,
12551 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012552 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012553 letters, digits, underscore). However, given the current
12554 definition of XID_Start and XID_Continue, it is sufficient
12555 to check just for these, except that _ must be allowed
12556 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012557 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012558 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012559 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012560
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012561 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012562 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012563 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012564 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012565 }
12566 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012567 return i;
12568}
12569
12570int
12571PyUnicode_IsIdentifier(PyObject *self)
12572{
12573 if (PyUnicode_IS_READY(self)) {
12574 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12575 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12576 /* an empty string is not a valid identifier */
12577 return len && i == len;
12578 }
12579 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012580_Py_COMP_DIAG_PUSH
12581_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012582 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012583 if (len == 0) {
12584 /* an empty string is not a valid identifier */
12585 return 0;
12586 }
12587
12588 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012589 Py_UCS4 ch = wstr[i++];
12590#if SIZEOF_WCHAR_T == 2
12591 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12592 && i < len
12593 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12594 {
12595 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12596 i++;
12597 }
12598#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012599 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12600 return 0;
12601 }
12602
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012603 while (i < len) {
12604 ch = wstr[i++];
12605#if SIZEOF_WCHAR_T == 2
12606 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12607 && i < len
12608 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12609 {
12610 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12611 i++;
12612 }
12613#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012614 if (!_PyUnicode_IsXidContinue(ch)) {
12615 return 0;
12616 }
12617 }
12618 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012619_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012620 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012621}
12622
INADA Naoki3ae20562017-01-16 20:41:20 +090012623/*[clinic input]
12624str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012625
INADA Naoki3ae20562017-01-16 20:41:20 +090012626Return True if the string is a valid Python identifier, False otherwise.
12627
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012628Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012629such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012630[clinic start generated code]*/
12631
12632static PyObject *
12633unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012634/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012635{
12636 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12637}
12638
INADA Naoki3ae20562017-01-16 20:41:20 +090012639/*[clinic input]
12640str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012641
INADA Naoki3ae20562017-01-16 20:41:20 +090012642Return True if the string is printable, False otherwise.
12643
12644A string is printable if all of its characters are considered printable in
12645repr() or if it is empty.
12646[clinic start generated code]*/
12647
12648static PyObject *
12649unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012650/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012651{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 Py_ssize_t i, length;
12653 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012654 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655
12656 if (PyUnicode_READY(self) == -1)
12657 return NULL;
12658 length = PyUnicode_GET_LENGTH(self);
12659 kind = PyUnicode_KIND(self);
12660 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012661
12662 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 if (length == 1)
12664 return PyBool_FromLong(
12665 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 for (i = 0; i < length; i++) {
12668 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012669 Py_RETURN_FALSE;
12670 }
12671 }
12672 Py_RETURN_TRUE;
12673}
12674
INADA Naoki3ae20562017-01-16 20:41:20 +090012675/*[clinic input]
12676str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677
INADA Naoki3ae20562017-01-16 20:41:20 +090012678 iterable: object
12679 /
12680
12681Concatenate any number of strings.
12682
Martin Panter91a88662017-01-24 00:30:06 +000012683The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012684The result is returned as a new string.
12685
12686Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12687[clinic start generated code]*/
12688
12689static PyObject *
12690unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012691/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692{
INADA Naoki3ae20562017-01-16 20:41:20 +090012693 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694}
12695
Martin v. Löwis18e16552006-02-15 17:27:45 +000012696static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012697unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 if (PyUnicode_READY(self) == -1)
12700 return -1;
12701 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702}
12703
INADA Naoki3ae20562017-01-16 20:41:20 +090012704/*[clinic input]
12705str.ljust as unicode_ljust
12706
12707 width: Py_ssize_t
12708 fillchar: Py_UCS4 = ' '
12709 /
12710
12711Return a left-justified string of length width.
12712
12713Padding is done using the specified fill character (default is a space).
12714[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715
12716static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012717unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12718/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012720 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012721 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722
Victor Stinnerc4b49542011-12-11 22:44:26 +010012723 if (PyUnicode_GET_LENGTH(self) >= width)
12724 return unicode_result_unchanged(self);
12725
12726 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727}
12728
INADA Naoki3ae20562017-01-16 20:41:20 +090012729/*[clinic input]
12730str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731
INADA Naoki3ae20562017-01-16 20:41:20 +090012732Return a copy of the string converted to lowercase.
12733[clinic start generated code]*/
12734
12735static PyObject *
12736unicode_lower_impl(PyObject *self)
12737/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012739 if (PyUnicode_READY(self) == -1)
12740 return NULL;
12741 if (PyUnicode_IS_ASCII(self))
12742 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012743 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744}
12745
/* Strip modes.  The values double as indexes into stripfuncnames below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  The table is only ever
   read, so both the strings and the array slots are declared const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip mode `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012754
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012755/* externally visible for str.strip(unicode) */
12756PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012757_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012758{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012759 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 int kind;
12761 Py_ssize_t i, j, len;
12762 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012763 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12766 return NULL;
12767
12768 kind = PyUnicode_KIND(self);
12769 data = PyUnicode_DATA(self);
12770 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012771 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12773 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012774 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012775
Benjamin Peterson14339b62009-01-31 16:36:08 +000012776 i = 0;
12777 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012778 while (i < len) {
12779 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12780 if (!BLOOM(sepmask, ch))
12781 break;
12782 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12783 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012784 i++;
12785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012786 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012787
Benjamin Peterson14339b62009-01-31 16:36:08 +000012788 j = len;
12789 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012790 j--;
12791 while (j >= i) {
12792 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12793 if (!BLOOM(sepmask, ch))
12794 break;
12795 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12796 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012798 }
12799
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012801 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012802
Victor Stinner7931d9a2011-11-04 00:22:48 +010012803 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804}
12805
12806PyObject*
12807PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12808{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012809 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012811 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812
Victor Stinnerde636f32011-10-01 03:55:54 +020012813 if (PyUnicode_READY(self) == -1)
12814 return NULL;
12815
Victor Stinner684d5fd2012-05-03 02:32:34 +020012816 length = PyUnicode_GET_LENGTH(self);
12817 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012818
Victor Stinner684d5fd2012-05-03 02:32:34 +020012819 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012820 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821
Victor Stinnerde636f32011-10-01 03:55:54 +020012822 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012823 PyErr_SetString(PyExc_IndexError, "string index out of range");
12824 return NULL;
12825 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012826 if (start >= length || end < start)
12827 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012828
Victor Stinner684d5fd2012-05-03 02:32:34 +020012829 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012830 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012831 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012832 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012833 }
12834 else {
12835 kind = PyUnicode_KIND(self);
12836 data = PyUnicode_1BYTE_DATA(self);
12837 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012838 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012839 length);
12840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842
12843static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012844do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846 Py_ssize_t len, i, j;
12847
12848 if (PyUnicode_READY(self) == -1)
12849 return NULL;
12850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012852
Victor Stinnercc7af722013-04-09 22:39:24 +020012853 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012854 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012855
12856 i = 0;
12857 if (striptype != RIGHTSTRIP) {
12858 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012859 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012860 if (!_Py_ascii_whitespace[ch])
12861 break;
12862 i++;
12863 }
12864 }
12865
12866 j = len;
12867 if (striptype != LEFTSTRIP) {
12868 j--;
12869 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012870 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012871 if (!_Py_ascii_whitespace[ch])
12872 break;
12873 j--;
12874 }
12875 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012876 }
12877 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012878 else {
12879 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012880 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012881
Victor Stinnercc7af722013-04-09 22:39:24 +020012882 i = 0;
12883 if (striptype != RIGHTSTRIP) {
12884 while (i < len) {
12885 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12886 if (!Py_UNICODE_ISSPACE(ch))
12887 break;
12888 i++;
12889 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012890 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012891
12892 j = len;
12893 if (striptype != LEFTSTRIP) {
12894 j--;
12895 while (j >= i) {
12896 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12897 if (!Py_UNICODE_ISSPACE(ch))
12898 break;
12899 j--;
12900 }
12901 j++;
12902 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012903 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012904
Victor Stinner7931d9a2011-11-04 00:22:48 +010012905 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906}
12907
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012908
12909static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012910do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012911{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012912 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012913 if (PyUnicode_Check(sep))
12914 return _PyUnicode_XStrip(self, striptype, sep);
12915 else {
12916 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012917 "%s arg must be None or str",
12918 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012919 return NULL;
12920 }
12921 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012922
Benjamin Peterson14339b62009-01-31 16:36:08 +000012923 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012924}
12925
12926
INADA Naoki3ae20562017-01-16 20:41:20 +090012927/*[clinic input]
12928str.strip as unicode_strip
12929
12930 chars: object = None
12931 /
12932
Zachary Ware09895c22019-10-09 16:09:00 -050012933Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012934
12935If chars is given and not None, remove characters in chars instead.
12936[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012937
12938static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012939unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012940/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012941{
INADA Naoki3ae20562017-01-16 20:41:20 +090012942 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012943}
12944
12945
INADA Naoki3ae20562017-01-16 20:41:20 +090012946/*[clinic input]
12947str.lstrip as unicode_lstrip
12948
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012949 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012950 /
12951
12952Return a copy of the string with leading whitespace removed.
12953
12954If chars is given and not None, remove characters in chars instead.
12955[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012956
12957static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012958unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012959/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012960{
INADA Naoki3ae20562017-01-16 20:41:20 +090012961 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012962}
12963
12964
INADA Naoki3ae20562017-01-16 20:41:20 +090012965/*[clinic input]
12966str.rstrip as unicode_rstrip
12967
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012968 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012969 /
12970
12971Return a copy of the string with trailing whitespace removed.
12972
12973If chars is given and not None, remove characters in chars instead.
12974[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012975
12976static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012977unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012978/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012979{
INADA Naoki3ae20562017-01-16 20:41:20 +090012980 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012981}
12982
12983
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012985unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012987 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012989
Serhiy Storchaka05997252013-01-26 12:14:02 +020012990 if (len < 1)
12991 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992
Victor Stinnerc4b49542011-12-11 22:44:26 +010012993 /* no repeat, return original string */
12994 if (len == 1)
12995 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012996
Benjamin Petersonbac79492012-01-14 13:34:47 -050012997 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 return NULL;
12999
Victor Stinnerc759f3e2011-10-01 03:09:58 +020013000 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020013001 PyErr_SetString(PyExc_OverflowError,
13002 "repeated string is too long");
13003 return NULL;
13004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013006
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013007 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008 if (!u)
13009 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020013010 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013013 int kind = PyUnicode_KIND(str);
13014 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010013015 if (kind == PyUnicode_1BYTE_KIND) {
13016 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013017 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010013018 }
13019 else if (kind == PyUnicode_2BYTE_KIND) {
13020 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013021 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010013022 ucs2[n] = fill_char;
13023 } else {
13024 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
13025 assert(kind == PyUnicode_4BYTE_KIND);
13026 for (n = 0; n < len; ++n)
13027 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020013028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029 }
13030 else {
13031 /* number of characters copied this far */
13032 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013033 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020013035 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000013037 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020013039 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013040 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000013041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042 }
13043
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013044 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013045 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046}
13047
Alexander Belopolsky40018472011-02-26 01:02:56 +000013048PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013049PyUnicode_Replace(PyObject *str,
13050 PyObject *substr,
13051 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000013052 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013054 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13055 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013056 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013057 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058}
13059
INADA Naoki3ae20562017-01-16 20:41:20 +090013060/*[clinic input]
13061str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062
INADA Naoki3ae20562017-01-16 20:41:20 +090013063 old: unicode
13064 new: unicode
13065 count: Py_ssize_t = -1
13066 Maximum number of occurrences to replace.
13067 -1 (the default value) means replace all occurrences.
13068 /
13069
13070Return a copy with all occurrences of substring old replaced by new.
13071
13072If the optional argument count is given, only the first count occurrences are
13073replaced.
13074[clinic start generated code]*/
13075
13076static PyObject *
13077unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13078 Py_ssize_t count)
13079/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013081 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013082 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013083 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084}
13085
sweeneydea81849b2020-04-22 17:05:48 -040013086/*[clinic input]
13087str.removeprefix as unicode_removeprefix
13088
13089 prefix: unicode
13090 /
13091
13092Return a str with the given prefix string removed if present.
13093
13094If the string starts with the prefix string, return string[len(prefix):].
13095Otherwise, return a copy of the original string.
13096[clinic start generated code]*/
13097
13098static PyObject *
13099unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13100/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13101{
13102 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13103 if (match == -1) {
13104 return NULL;
13105 }
13106 if (match) {
13107 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13108 PyUnicode_GET_LENGTH(self));
13109 }
13110 return unicode_result_unchanged(self);
13111}
13112
13113/*[clinic input]
13114str.removesuffix as unicode_removesuffix
13115
13116 suffix: unicode
13117 /
13118
13119Return a str with the given suffix string removed if present.
13120
13121If the string ends with the suffix string and that suffix is not empty,
13122return string[:-len(suffix)]. Otherwise, return a copy of the original
13123string.
13124[clinic start generated code]*/
13125
13126static PyObject *
13127unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13128/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13129{
13130 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13131 if (match == -1) {
13132 return NULL;
13133 }
13134 if (match) {
13135 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13136 - PyUnicode_GET_LENGTH(suffix));
13137 }
13138 return unicode_result_unchanged(self);
13139}
13140
Alexander Belopolsky40018472011-02-26 01:02:56 +000013141static PyObject *
13142unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013144 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 Py_ssize_t isize;
13146 Py_ssize_t osize, squote, dquote, i, o;
13147 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013148 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013149 const void *idata;
13150 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013153 return NULL;
13154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013155 isize = PyUnicode_GET_LENGTH(unicode);
13156 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 /* Compute length of output, quote characters, and
13159 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013160 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161 max = 127;
13162 squote = dquote = 0;
13163 ikind = PyUnicode_KIND(unicode);
13164 for (i = 0; i < isize; i++) {
13165 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013166 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013167 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013168 case '\'': squote++; break;
13169 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013170 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013171 incr = 2;
13172 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 default:
13174 /* Fast-path ASCII */
13175 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013176 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013178 ;
13179 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013180 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013181 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013182 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013184 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013185 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013186 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013187 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013188 if (osize > PY_SSIZE_T_MAX - incr) {
13189 PyErr_SetString(PyExc_OverflowError,
13190 "string is too long to generate repr");
13191 return NULL;
13192 }
13193 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194 }
13195
13196 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013197 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013199 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 if (dquote)
13201 /* Both squote and dquote present. Use squote,
13202 and escape them */
13203 osize += squote;
13204 else
13205 quote = '"';
13206 }
Victor Stinner55c08782013-04-14 18:45:39 +020013207 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208
13209 repr = PyUnicode_New(osize, max);
13210 if (repr == NULL)
13211 return NULL;
13212 okind = PyUnicode_KIND(repr);
13213 odata = PyUnicode_DATA(repr);
13214
13215 PyUnicode_WRITE(okind, odata, 0, quote);
13216 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013217 if (unchanged) {
13218 _PyUnicode_FastCopyCharacters(repr, 1,
13219 unicode, 0,
13220 isize);
13221 }
13222 else {
13223 for (i = 0, o = 1; i < isize; i++) {
13224 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013225
Victor Stinner55c08782013-04-14 18:45:39 +020013226 /* Escape quotes and backslashes */
13227 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013228 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013229 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013230 continue;
13231 }
13232
13233 /* Map special whitespace to '\t', \n', '\r' */
13234 if (ch == '\t') {
13235 PyUnicode_WRITE(okind, odata, o++, '\\');
13236 PyUnicode_WRITE(okind, odata, o++, 't');
13237 }
13238 else if (ch == '\n') {
13239 PyUnicode_WRITE(okind, odata, o++, '\\');
13240 PyUnicode_WRITE(okind, odata, o++, 'n');
13241 }
13242 else if (ch == '\r') {
13243 PyUnicode_WRITE(okind, odata, o++, '\\');
13244 PyUnicode_WRITE(okind, odata, o++, 'r');
13245 }
13246
13247 /* Map non-printable US ASCII to '\xhh' */
13248 else if (ch < ' ' || ch == 0x7F) {
13249 PyUnicode_WRITE(okind, odata, o++, '\\');
13250 PyUnicode_WRITE(okind, odata, o++, 'x');
13251 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13252 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13253 }
13254
13255 /* Copy ASCII characters as-is */
13256 else if (ch < 0x7F) {
13257 PyUnicode_WRITE(okind, odata, o++, ch);
13258 }
13259
13260 /* Non-ASCII characters */
13261 else {
13262 /* Map Unicode whitespace and control characters
13263 (categories Z* and C* except ASCII space)
13264 */
13265 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13266 PyUnicode_WRITE(okind, odata, o++, '\\');
13267 /* Map 8-bit characters to '\xhh' */
13268 if (ch <= 0xff) {
13269 PyUnicode_WRITE(okind, odata, o++, 'x');
13270 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13271 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13272 }
13273 /* Map 16-bit characters to '\uxxxx' */
13274 else if (ch <= 0xffff) {
13275 PyUnicode_WRITE(okind, odata, o++, 'u');
13276 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13277 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13278 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13279 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13280 }
13281 /* Map 21-bit characters to '\U00xxxxxx' */
13282 else {
13283 PyUnicode_WRITE(okind, odata, o++, 'U');
13284 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13285 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13286 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13287 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13288 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13289 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13290 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13291 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13292 }
13293 }
13294 /* Copy characters as-is */
13295 else {
13296 PyUnicode_WRITE(okind, odata, o++, ch);
13297 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013298 }
13299 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013302 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013303 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304}
13305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013306PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013307 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013308\n\
13309Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013310such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311arguments start and end are interpreted as in slice notation.\n\
13312\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013313Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013314
13315static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013318 /* initialize variables to prevent gcc warning */
13319 PyObject *substring = NULL;
13320 Py_ssize_t start = 0;
13321 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013322 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013324 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013327 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013330 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013332 if (result == -2)
13333 return NULL;
13334
Christian Heimes217cfd12007-12-02 14:31:20 +000013335 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336}
13337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013338PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013339 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013341Return the highest index in S where substring sub is found,\n\
13342such that sub is contained within S[start:end]. Optional\n\
13343arguments start and end are interpreted as in slice notation.\n\
13344\n\
13345Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346
13347static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013350 /* initialize variables to prevent gcc warning */
13351 PyObject *substring = NULL;
13352 Py_ssize_t start = 0;
13353 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013354 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013356 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013359 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013360 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013361
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013362 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013364 if (result == -2)
13365 return NULL;
13366
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367 if (result < 0) {
13368 PyErr_SetString(PyExc_ValueError, "substring not found");
13369 return NULL;
13370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371
Christian Heimes217cfd12007-12-02 14:31:20 +000013372 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373}
13374
INADA Naoki3ae20562017-01-16 20:41:20 +090013375/*[clinic input]
13376str.rjust as unicode_rjust
13377
13378 width: Py_ssize_t
13379 fillchar: Py_UCS4 = ' '
13380 /
13381
13382Return a right-justified string of length width.
13383
13384Padding is done using the specified fill character (default is a space).
13385[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013386
13387static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013388unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13389/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013390{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013391 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013392 return NULL;
13393
Victor Stinnerc4b49542011-12-11 22:44:26 +010013394 if (PyUnicode_GET_LENGTH(self) >= width)
13395 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396
Victor Stinnerc4b49542011-12-11 22:44:26 +010013397 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398}
13399
Alexander Belopolsky40018472011-02-26 01:02:56 +000013400PyObject *
13401PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013402{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013403 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013405
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013406 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013407}
13408
INADA Naoki3ae20562017-01-16 20:41:20 +090013409/*[clinic input]
13410str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411
INADA Naoki3ae20562017-01-16 20:41:20 +090013412 sep: object = None
13413 The delimiter according which to split the string.
13414 None (the default value) means split according to any whitespace,
13415 and discard empty strings from the result.
13416 maxsplit: Py_ssize_t = -1
13417 Maximum number of splits to do.
13418 -1 (the default value) means no limit.
13419
13420Return a list of the words in the string, using sep as the delimiter string.
13421[clinic start generated code]*/
13422
13423static PyObject *
13424unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13425/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426{
INADA Naoki3ae20562017-01-16 20:41:20 +090013427 if (sep == Py_None)
13428 return split(self, NULL, maxsplit);
13429 if (PyUnicode_Check(sep))
13430 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013431
Victor Stinner998b8062018-09-12 00:23:25 +020013432 PyErr_Format(PyExc_TypeError,
13433 "must be str or None, not %.100s",
13434 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013435 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436}
13437
Thomas Wouters477c8d52006-05-27 19:21:47 +000013438PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013439PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013440{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013441 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013442 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013443 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013444 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013445
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013446 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013448
Victor Stinner14f8f022011-10-05 20:58:25 +020013449 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013451 len1 = PyUnicode_GET_LENGTH(str_obj);
13452 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013453 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013454 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013455 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013456 }
13457 buf1 = PyUnicode_DATA(str_obj);
13458 buf2 = PyUnicode_DATA(sep_obj);
13459 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013460 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013461 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013462 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013464
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013465 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013466 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013467 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13468 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13469 else
13470 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013471 break;
13472 case PyUnicode_2BYTE_KIND:
13473 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13474 break;
13475 case PyUnicode_4BYTE_KIND:
13476 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13477 break;
13478 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013479 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013480 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013481
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013482 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013483 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013484 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013485
13486 return out;
13487}
13488
13489
13490PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013491PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013492{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013493 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013494 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013495 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013496 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013497
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013498 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013499 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013500
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013501 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013502 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 len1 = PyUnicode_GET_LENGTH(str_obj);
13504 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013505 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013506 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013507 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013508 }
13509 buf1 = PyUnicode_DATA(str_obj);
13510 buf2 = PyUnicode_DATA(sep_obj);
13511 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013512 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013513 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013514 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013516
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013517 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013518 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013519 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13520 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13521 else
13522 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013523 break;
13524 case PyUnicode_2BYTE_KIND:
13525 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13526 break;
13527 case PyUnicode_4BYTE_KIND:
13528 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13529 break;
13530 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013531 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013532 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013533
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013534 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013535 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013536 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013537
13538 return out;
13539}
13540
INADA Naoki3ae20562017-01-16 20:41:20 +090013541/*[clinic input]
13542str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013543
INADA Naoki3ae20562017-01-16 20:41:20 +090013544 sep: object
13545 /
13546
13547Partition the string into three parts using the given separator.
13548
13549This will search for the separator in the string. If the separator is found,
13550returns a 3-tuple containing the part before the separator, the separator
13551itself, and the part after it.
13552
13553If the separator is not found, returns a 3-tuple containing the original string
13554and two empty strings.
13555[clinic start generated code]*/
13556
13557static PyObject *
13558unicode_partition(PyObject *self, PyObject *sep)
13559/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013560{
INADA Naoki3ae20562017-01-16 20:41:20 +090013561 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013562}
13563
INADA Naoki3ae20562017-01-16 20:41:20 +090013564/*[clinic input]
13565str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013566
INADA Naoki3ae20562017-01-16 20:41:20 +090013567Partition the string into three parts using the given separator.
13568
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013569This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013570the separator is found, returns a 3-tuple containing the part before the
13571separator, the separator itself, and the part after it.
13572
13573If the separator is not found, returns a 3-tuple containing two empty strings
13574and the original string.
13575[clinic start generated code]*/
13576
13577static PyObject *
13578unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013579/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013580{
INADA Naoki3ae20562017-01-16 20:41:20 +090013581 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013582}
13583
Alexander Belopolsky40018472011-02-26 01:02:56 +000013584PyObject *
13585PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013586{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013587 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013588 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013589
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013590 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013591}
13592
INADA Naoki3ae20562017-01-16 20:41:20 +090013593/*[clinic input]
13594str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013595
INADA Naoki3ae20562017-01-16 20:41:20 +090013596Return a list of the words in the string, using sep as the delimiter string.
13597
13598Splits are done starting at the end of the string and working to the front.
13599[clinic start generated code]*/
13600
13601static PyObject *
13602unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13603/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013604{
INADA Naoki3ae20562017-01-16 20:41:20 +090013605 if (sep == Py_None)
13606 return rsplit(self, NULL, maxsplit);
13607 if (PyUnicode_Check(sep))
13608 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013609
Victor Stinner998b8062018-09-12 00:23:25 +020013610 PyErr_Format(PyExc_TypeError,
13611 "must be str or None, not %.100s",
13612 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013613 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013614}
13615
INADA Naoki3ae20562017-01-16 20:41:20 +090013616/*[clinic input]
13617str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013618
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013619 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013620
13621Return a list of the lines in the string, breaking at line boundaries.
13622
13623Line breaks are not included in the resulting list unless keepends is given and
13624true.
13625[clinic start generated code]*/
13626
13627static PyObject *
13628unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013629/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013630{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013631 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013632}
13633
13634static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013635PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013636{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013637 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013638}
13639
INADA Naoki3ae20562017-01-16 20:41:20 +090013640/*[clinic input]
13641str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013642
INADA Naoki3ae20562017-01-16 20:41:20 +090013643Convert uppercase characters to lowercase and lowercase characters to uppercase.
13644[clinic start generated code]*/
13645
13646static PyObject *
13647unicode_swapcase_impl(PyObject *self)
13648/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013649{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013650 if (PyUnicode_READY(self) == -1)
13651 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013652 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013653}
13654
Larry Hastings61272b72014-01-07 12:41:53 -080013655/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013656
Larry Hastings31826802013-10-19 00:09:25 -070013657@staticmethod
13658str.maketrans as unicode_maketrans
13659
13660 x: object
13661
13662 y: unicode=NULL
13663
13664 z: unicode=NULL
13665
13666 /
13667
13668Return a translation table usable for str.translate().
13669
13670If there is only one argument, it must be a dictionary mapping Unicode
13671ordinals (integers) or characters to Unicode ordinals, strings or None.
13672Character keys will be then converted to ordinals.
13673If there are two arguments, they must be strings of equal length, and
13674in the resulting dictionary, each character in x will be mapped to the
13675character at the same position in y. If there is a third argument, it
13676must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013677[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013678
Larry Hastings31826802013-10-19 00:09:25 -070013679static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013680unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013681/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013682{
Georg Brandlceee0772007-11-27 23:48:05 +000013683 PyObject *new = NULL, *key, *value;
13684 Py_ssize_t i = 0;
13685 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013686
Georg Brandlceee0772007-11-27 23:48:05 +000013687 new = PyDict_New();
13688 if (!new)
13689 return NULL;
13690 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013691 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013692 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013693
Georg Brandlceee0772007-11-27 23:48:05 +000013694 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013695 if (!PyUnicode_Check(x)) {
13696 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13697 "be a string if there is a second argument");
13698 goto err;
13699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013700 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013701 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13702 "arguments must have equal length");
13703 goto err;
13704 }
13705 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013706 x_kind = PyUnicode_KIND(x);
13707 y_kind = PyUnicode_KIND(y);
13708 x_data = PyUnicode_DATA(x);
13709 y_data = PyUnicode_DATA(y);
13710 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13711 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013712 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013713 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013714 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013715 if (!value) {
13716 Py_DECREF(key);
13717 goto err;
13718 }
Georg Brandlceee0772007-11-27 23:48:05 +000013719 res = PyDict_SetItem(new, key, value);
13720 Py_DECREF(key);
13721 Py_DECREF(value);
13722 if (res < 0)
13723 goto err;
13724 }
13725 /* create entries for deleting chars in z */
13726 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013727 z_kind = PyUnicode_KIND(z);
13728 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013729 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013730 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013731 if (!key)
13732 goto err;
13733 res = PyDict_SetItem(new, key, Py_None);
13734 Py_DECREF(key);
13735 if (res < 0)
13736 goto err;
13737 }
13738 }
13739 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013740 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013741 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013742
Georg Brandlceee0772007-11-27 23:48:05 +000013743 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013744 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013745 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13746 "to maketrans it must be a dict");
13747 goto err;
13748 }
13749 /* copy entries into the new dict, converting string keys to int keys */
13750 while (PyDict_Next(x, &i, &key, &value)) {
13751 if (PyUnicode_Check(key)) {
13752 /* convert string keys to integer keys */
13753 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013754 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013755 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13756 "table must be of length 1");
13757 goto err;
13758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013759 kind = PyUnicode_KIND(key);
13760 data = PyUnicode_DATA(key);
13761 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013762 if (!newkey)
13763 goto err;
13764 res = PyDict_SetItem(new, newkey, value);
13765 Py_DECREF(newkey);
13766 if (res < 0)
13767 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013768 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013769 /* just keep integer keys */
13770 if (PyDict_SetItem(new, key, value) < 0)
13771 goto err;
13772 } else {
13773 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13774 "be strings or integers");
13775 goto err;
13776 }
13777 }
13778 }
13779 return new;
13780 err:
13781 Py_DECREF(new);
13782 return NULL;
13783}
13784
INADA Naoki3ae20562017-01-16 20:41:20 +090013785/*[clinic input]
13786str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013787
INADA Naoki3ae20562017-01-16 20:41:20 +090013788 table: object
13789 Translation table, which must be a mapping of Unicode ordinals to
13790 Unicode ordinals, strings, or None.
13791 /
13792
13793Replace each character in the string using the given translation table.
13794
13795The table must implement lookup/indexing via __getitem__, for instance a
13796dictionary or list. If this operation raises LookupError, the character is
13797left untouched. Characters mapped to None are deleted.
13798[clinic start generated code]*/
13799
13800static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013801unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013802/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013804 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013805}
13806
INADA Naoki3ae20562017-01-16 20:41:20 +090013807/*[clinic input]
13808str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013809
INADA Naoki3ae20562017-01-16 20:41:20 +090013810Return a copy of the string converted to uppercase.
13811[clinic start generated code]*/
13812
13813static PyObject *
13814unicode_upper_impl(PyObject *self)
13815/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013816{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013817 if (PyUnicode_READY(self) == -1)
13818 return NULL;
13819 if (PyUnicode_IS_ASCII(self))
13820 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013821 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013822}
13823
INADA Naoki3ae20562017-01-16 20:41:20 +090013824/*[clinic input]
13825str.zfill as unicode_zfill
13826
13827 width: Py_ssize_t
13828 /
13829
13830Pad a numeric string with zeros on the left, to fill a field of the given width.
13831
13832The string is never truncated.
13833[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013834
13835static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013836unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013837/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013838{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013839 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013840 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013841 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013842 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013843 Py_UCS4 chr;
13844
Benjamin Petersonbac79492012-01-14 13:34:47 -050013845 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013846 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013847
Victor Stinnerc4b49542011-12-11 22:44:26 +010013848 if (PyUnicode_GET_LENGTH(self) >= width)
13849 return unicode_result_unchanged(self);
13850
13851 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013852
13853 u = pad(self, fill, 0, '0');
13854
Walter Dörwald068325e2002-04-15 13:36:47 +000013855 if (u == NULL)
13856 return NULL;
13857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013858 kind = PyUnicode_KIND(u);
13859 data = PyUnicode_DATA(u);
13860 chr = PyUnicode_READ(kind, data, fill);
13861
13862 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013863 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013864 PyUnicode_WRITE(kind, data, 0, chr);
13865 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013866 }
13867
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013868 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013869 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013870}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013871
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013880PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013881 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013882\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013883Return True if S starts with the specified prefix, False otherwise.\n\
13884With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013885With optional end, stop comparing S at that position.\n\
13886prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013887
13888static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013889unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013890 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013892 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013893 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013894 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013895 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013896 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013897
Jesus Ceaac451502011-04-20 17:09:23 +020013898 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013899 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013900 if (PyTuple_Check(subobj)) {
13901 Py_ssize_t i;
13902 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013903 substring = PyTuple_GET_ITEM(subobj, i);
13904 if (!PyUnicode_Check(substring)) {
13905 PyErr_Format(PyExc_TypeError,
13906 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013907 "not %.100s",
13908 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013909 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013910 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013911 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013912 if (result == -1)
13913 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013914 if (result) {
13915 Py_RETURN_TRUE;
13916 }
13917 }
13918 /* nothing matched */
13919 Py_RETURN_FALSE;
13920 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013921 if (!PyUnicode_Check(subobj)) {
13922 PyErr_Format(PyExc_TypeError,
13923 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013924 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013925 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013926 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013927 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013928 if (result == -1)
13929 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013930 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013931}
13932
13933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013934PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013935 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013937Return True if S ends with the specified suffix, False otherwise.\n\
13938With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013939With optional end, stop comparing S at that position.\n\
13940suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013941
13942static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013943unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013944 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013945{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013946 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013947 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013948 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013949 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013950 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951
Jesus Ceaac451502011-04-20 17:09:23 +020013952 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013953 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013954 if (PyTuple_Check(subobj)) {
13955 Py_ssize_t i;
13956 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013957 substring = PyTuple_GET_ITEM(subobj, i);
13958 if (!PyUnicode_Check(substring)) {
13959 PyErr_Format(PyExc_TypeError,
13960 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013961 "not %.100s",
13962 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013963 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013964 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013965 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013966 if (result == -1)
13967 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013968 if (result) {
13969 Py_RETURN_TRUE;
13970 }
13971 }
13972 Py_RETURN_FALSE;
13973 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013974 if (!PyUnicode_Check(subobj)) {
13975 PyErr_Format(PyExc_TypeError,
13976 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013977 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013978 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013979 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013980 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013981 if (result == -1)
13982 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013983 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013984}
13985
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013986static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013987_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013988{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013989 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13990 writer->data = PyUnicode_DATA(writer->buffer);
13991
13992 if (!writer->readonly) {
13993 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013994 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013995 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013996 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013997 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13998 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13999 writer->kind = PyUnicode_WCHAR_KIND;
14000 assert(writer->kind <= PyUnicode_1BYTE_KIND);
14001
Victor Stinner8f674cc2013-04-17 23:02:17 +020014002 /* Copy-on-write mode: set buffer size to 0 so
14003 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
14004 * next write. */
14005 writer->size = 0;
14006 }
Victor Stinner202fdca2012-05-07 12:47:02 +020014007}
14008
Victor Stinnerd3f08822012-05-29 12:57:52 +020014009void
Victor Stinner8f674cc2013-04-17 23:02:17 +020014010_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014011{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014012 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020014013
14014 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020014015 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020014016
14017 /* use a value smaller than PyUnicode_1BYTE_KIND() so
14018 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14019 writer->kind = PyUnicode_WCHAR_KIND;
14020 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020014021}
14022
Inada Naoki770847a2019-06-24 12:30:24 +090014023// Initialize _PyUnicodeWriter with initial buffer
14024static inline void
14025_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
14026{
14027 memset(writer, 0, sizeof(*writer));
14028 writer->buffer = buffer;
14029 _PyUnicodeWriter_Update(writer);
14030 writer->min_length = writer->size;
14031}
14032
Victor Stinnerd3f08822012-05-29 12:57:52 +020014033int
14034_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14035 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020014036{
14037 Py_ssize_t newlen;
14038 PyObject *newbuffer;
14039
Victor Stinner2740e462016-09-06 16:58:36 -070014040 assert(maxchar <= MAX_UNICODE);
14041
Victor Stinnerca9381e2015-09-22 00:58:32 +020014042 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020014043 assert((maxchar > writer->maxchar && length >= 0)
14044 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014045
Victor Stinner202fdca2012-05-07 12:47:02 +020014046 if (length > PY_SSIZE_T_MAX - writer->pos) {
14047 PyErr_NoMemory();
14048 return -1;
14049 }
14050 newlen = writer->pos + length;
14051
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014052 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020014053
Victor Stinnerd3f08822012-05-29 12:57:52 +020014054 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020014055 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010014056 if (writer->overallocate
14057 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14058 /* overallocate to limit the number of realloc() */
14059 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014060 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014061 if (newlen < writer->min_length)
14062 newlen = writer->min_length;
14063
Victor Stinnerd3f08822012-05-29 12:57:52 +020014064 writer->buffer = PyUnicode_New(newlen, maxchar);
14065 if (writer->buffer == NULL)
14066 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014067 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014068 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014069 if (writer->overallocate
14070 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14071 /* overallocate to limit the number of realloc() */
14072 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014073 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014074 if (newlen < writer->min_length)
14075 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014076
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014077 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014078 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014079 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014080 newbuffer = PyUnicode_New(newlen, maxchar);
14081 if (newbuffer == NULL)
14082 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014083 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14084 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014085 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014086 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014087 }
14088 else {
14089 newbuffer = resize_compact(writer->buffer, newlen);
14090 if (newbuffer == NULL)
14091 return -1;
14092 }
14093 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014094 }
14095 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014096 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014097 newbuffer = PyUnicode_New(writer->size, maxchar);
14098 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014099 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014100 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14101 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014102 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014103 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014104 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014105 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014106
14107#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014108}
14109
Victor Stinnerca9381e2015-09-22 00:58:32 +020014110int
14111_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14112 enum PyUnicode_Kind kind)
14113{
14114 Py_UCS4 maxchar;
14115
14116 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14117 assert(writer->kind < kind);
14118
14119 switch (kind)
14120 {
14121 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14122 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
Victor Stinner99768342021-03-17 21:46:53 +010014123 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
Victor Stinnerca9381e2015-09-22 00:58:32 +020014124 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014125 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014126 }
14127
14128 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14129}
14130
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014131static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014132_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014133{
Victor Stinner2740e462016-09-06 16:58:36 -070014134 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014135 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14136 return -1;
14137 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14138 writer->pos++;
14139 return 0;
14140}
14141
14142int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014143_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14144{
14145 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14146}
14147
14148int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014149_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14150{
14151 Py_UCS4 maxchar;
14152 Py_ssize_t len;
14153
14154 if (PyUnicode_READY(str) == -1)
14155 return -1;
14156 len = PyUnicode_GET_LENGTH(str);
14157 if (len == 0)
14158 return 0;
14159 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14160 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014161 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014162 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014163 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014164 Py_INCREF(str);
14165 writer->buffer = str;
14166 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014167 writer->pos += len;
14168 return 0;
14169 }
14170 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14171 return -1;
14172 }
14173 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14174 str, 0, len);
14175 writer->pos += len;
14176 return 0;
14177}
14178
Victor Stinnere215d962012-10-06 23:03:36 +020014179int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014180_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14181 Py_ssize_t start, Py_ssize_t end)
14182{
14183 Py_UCS4 maxchar;
14184 Py_ssize_t len;
14185
14186 if (PyUnicode_READY(str) == -1)
14187 return -1;
14188
14189 assert(0 <= start);
14190 assert(end <= PyUnicode_GET_LENGTH(str));
14191 assert(start <= end);
14192
14193 if (end == 0)
14194 return 0;
14195
14196 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14197 return _PyUnicodeWriter_WriteStr(writer, str);
14198
14199 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14200 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14201 else
14202 maxchar = writer->maxchar;
14203 len = end - start;
14204
14205 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14206 return -1;
14207
14208 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14209 str, start, len);
14210 writer->pos += len;
14211 return 0;
14212}
14213
14214int
Victor Stinner4a587072013-11-19 12:54:53 +010014215_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14216 const char *ascii, Py_ssize_t len)
14217{
14218 if (len == -1)
14219 len = strlen(ascii);
14220
Andy Lestere6be9b52020-02-11 20:28:35 -060014221 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014222
14223 if (writer->buffer == NULL && !writer->overallocate) {
14224 PyObject *str;
14225
14226 str = _PyUnicode_FromASCII(ascii, len);
14227 if (str == NULL)
14228 return -1;
14229
14230 writer->readonly = 1;
14231 writer->buffer = str;
14232 _PyUnicodeWriter_Update(writer);
14233 writer->pos += len;
14234 return 0;
14235 }
14236
14237 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14238 return -1;
14239
14240 switch (writer->kind)
14241 {
14242 case PyUnicode_1BYTE_KIND:
14243 {
14244 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14245 Py_UCS1 *data = writer->data;
14246
Christian Heimesf051e432016-09-13 20:22:02 +020014247 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014248 break;
14249 }
14250 case PyUnicode_2BYTE_KIND:
14251 {
14252 _PyUnicode_CONVERT_BYTES(
14253 Py_UCS1, Py_UCS2,
14254 ascii, ascii + len,
14255 (Py_UCS2 *)writer->data + writer->pos);
14256 break;
14257 }
14258 case PyUnicode_4BYTE_KIND:
14259 {
14260 _PyUnicode_CONVERT_BYTES(
14261 Py_UCS1, Py_UCS4,
14262 ascii, ascii + len,
14263 (Py_UCS4 *)writer->data + writer->pos);
14264 break;
14265 }
14266 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014267 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014268 }
14269
14270 writer->pos += len;
14271 return 0;
14272}
14273
14274int
14275_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14276 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014277{
14278 Py_UCS4 maxchar;
14279
Andy Lestere6be9b52020-02-11 20:28:35 -060014280 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014281 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14282 return -1;
14283 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14284 writer->pos += len;
14285 return 0;
14286}
14287
Victor Stinnerd3f08822012-05-29 12:57:52 +020014288PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014289_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014290{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014291 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014292
Victor Stinnerd3f08822012-05-29 12:57:52 +020014293 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014294 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014295 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014296 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014297
14298 str = writer->buffer;
14299 writer->buffer = NULL;
14300
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014301 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014302 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14303 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014304 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014305
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014306 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14307 PyObject *str2;
14308 str2 = resize_compact(str, writer->pos);
14309 if (str2 == NULL) {
14310 Py_DECREF(str);
14311 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014312 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014313 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014314 }
14315
Victor Stinner15a0bd32013-07-08 22:29:55 +020014316 assert(_PyUnicode_CheckConsistency(str, 1));
14317 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014318}
14319
Victor Stinnerd3f08822012-05-29 12:57:52 +020014320void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014321_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014322{
14323 Py_CLEAR(writer->buffer);
14324}
14325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014326#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014327
14328PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014329 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014330\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014331Return a formatted version of S, using substitutions from args and kwargs.\n\
14332The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014333
Eric Smith27bbca62010-11-04 17:06:58 +000014334PyDoc_STRVAR(format_map__doc__,
14335 "S.format_map(mapping) -> str\n\
14336\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014337Return a formatted version of S, using substitutions from mapping.\n\
14338The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014339
INADA Naoki3ae20562017-01-16 20:41:20 +090014340/*[clinic input]
14341str.__format__ as unicode___format__
14342
14343 format_spec: unicode
14344 /
14345
14346Return a formatted version of the string as described by format_spec.
14347[clinic start generated code]*/
14348
Eric Smith4a7d76d2008-05-30 18:10:19 +000014349static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014350unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014351/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014352{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014353 _PyUnicodeWriter writer;
14354 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014355
Victor Stinnerd3f08822012-05-29 12:57:52 +020014356 if (PyUnicode_READY(self) == -1)
14357 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014358 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014359 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14360 self, format_spec, 0,
14361 PyUnicode_GET_LENGTH(format_spec));
14362 if (ret == -1) {
14363 _PyUnicodeWriter_Dealloc(&writer);
14364 return NULL;
14365 }
14366 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014367}
14368
INADA Naoki3ae20562017-01-16 20:41:20 +090014369/*[clinic input]
14370str.__sizeof__ as unicode_sizeof
14371
14372Return the size of the string in memory, in bytes.
14373[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014374
14375static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014376unicode_sizeof_impl(PyObject *self)
14377/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014378{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014379 Py_ssize_t size;
14380
14381 /* If it's a compact object, account for base structure +
14382 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014383 if (PyUnicode_IS_COMPACT_ASCII(self))
14384 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14385 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014386 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014387 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014388 else {
14389 /* If it is a two-block object, account for base object, and
14390 for character block if present. */
14391 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014392 if (_PyUnicode_DATA_ANY(self))
14393 size += (PyUnicode_GET_LENGTH(self) + 1) *
14394 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014395 }
14396 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014397 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014398 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14399 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14400 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14401 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014402
14403 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014404}
14405
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014406static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014407unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014408{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014409 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014410 if (!copy)
14411 return NULL;
14412 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014413}
14414
Guido van Rossumd57fd912000-03-10 22:53:23 +000014415static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014416 UNICODE_ENCODE_METHODDEF
14417 UNICODE_REPLACE_METHODDEF
14418 UNICODE_SPLIT_METHODDEF
14419 UNICODE_RSPLIT_METHODDEF
14420 UNICODE_JOIN_METHODDEF
14421 UNICODE_CAPITALIZE_METHODDEF
14422 UNICODE_CASEFOLD_METHODDEF
14423 UNICODE_TITLE_METHODDEF
14424 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014425 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014426 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014427 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014428 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014429 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014430 UNICODE_LJUST_METHODDEF
14431 UNICODE_LOWER_METHODDEF
14432 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014433 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14434 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014435 UNICODE_RJUST_METHODDEF
14436 UNICODE_RSTRIP_METHODDEF
14437 UNICODE_RPARTITION_METHODDEF
14438 UNICODE_SPLITLINES_METHODDEF
14439 UNICODE_STRIP_METHODDEF
14440 UNICODE_SWAPCASE_METHODDEF
14441 UNICODE_TRANSLATE_METHODDEF
14442 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014443 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14444 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014445 UNICODE_REMOVEPREFIX_METHODDEF
14446 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014447 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014448 UNICODE_ISLOWER_METHODDEF
14449 UNICODE_ISUPPER_METHODDEF
14450 UNICODE_ISTITLE_METHODDEF
14451 UNICODE_ISSPACE_METHODDEF
14452 UNICODE_ISDECIMAL_METHODDEF
14453 UNICODE_ISDIGIT_METHODDEF
14454 UNICODE_ISNUMERIC_METHODDEF
14455 UNICODE_ISALPHA_METHODDEF
14456 UNICODE_ISALNUM_METHODDEF
14457 UNICODE_ISIDENTIFIER_METHODDEF
14458 UNICODE_ISPRINTABLE_METHODDEF
14459 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014460 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014461 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014462 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014463 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014464 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014465#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014466 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014467 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014468#endif
14469
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014470 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014471 {NULL, NULL}
14472};
14473
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014474static PyObject *
14475unicode_mod(PyObject *v, PyObject *w)
14476{
Brian Curtindfc80e32011-08-10 20:28:54 -050014477 if (!PyUnicode_Check(v))
14478 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014479 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014480}
14481
14482static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014483 0, /*nb_add*/
14484 0, /*nb_subtract*/
14485 0, /*nb_multiply*/
14486 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014487};
14488
Guido van Rossumd57fd912000-03-10 22:53:23 +000014489static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014490 (lenfunc) unicode_length, /* sq_length */
14491 PyUnicode_Concat, /* sq_concat */
14492 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14493 (ssizeargfunc) unicode_getitem, /* sq_item */
14494 0, /* sq_slice */
14495 0, /* sq_ass_item */
14496 0, /* sq_ass_slice */
14497 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014498};
14499
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014500static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014501unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014502{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014503 if (PyUnicode_READY(self) == -1)
14504 return NULL;
14505
Victor Stinnera15e2602020-04-08 02:01:56 +020014506 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014507 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014508 if (i == -1 && PyErr_Occurred())
14509 return NULL;
14510 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014511 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014512 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014513 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014514 Py_ssize_t start, stop, step, slicelength, i;
14515 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014516 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014517 const void *src_data;
14518 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014519 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014520 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014521
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014522 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014523 return NULL;
14524 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014525 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14526 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014527
14528 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014529 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014530 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014531 slicelength == PyUnicode_GET_LENGTH(self)) {
14532 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014533 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014534 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014535 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014536 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014537 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014538 src_kind = PyUnicode_KIND(self);
14539 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014540 if (!PyUnicode_IS_ASCII(self)) {
14541 kind_limit = kind_maxchar_limit(src_kind);
14542 max_char = 0;
14543 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14544 ch = PyUnicode_READ(src_kind, src_data, cur);
14545 if (ch > max_char) {
14546 max_char = ch;
14547 if (max_char >= kind_limit)
14548 break;
14549 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014550 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014551 }
Victor Stinner55c99112011-10-13 01:17:06 +020014552 else
14553 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014554 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014555 if (result == NULL)
14556 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014557 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014558 dest_data = PyUnicode_DATA(result);
14559
14560 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014561 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14562 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014563 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014564 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014565 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014566 } else {
14567 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14568 return NULL;
14569 }
14570}
14571
14572static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014573 (lenfunc)unicode_length, /* mp_length */
14574 (binaryfunc)unicode_subscript, /* mp_subscript */
14575 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014576};
14577
Guido van Rossumd57fd912000-03-10 22:53:23 +000014578
Guido van Rossumd57fd912000-03-10 22:53:23 +000014579/* Helpers for PyUnicode_Format() */
14580
Victor Stinnera47082312012-10-04 02:19:54 +020014581struct unicode_formatter_t {
14582 PyObject *args;
14583 int args_owned;
14584 Py_ssize_t arglen, argidx;
14585 PyObject *dict;
14586
14587 enum PyUnicode_Kind fmtkind;
14588 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014589 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014590 PyObject *fmtstr;
14591
14592 _PyUnicodeWriter writer;
14593};
14594
14595struct unicode_format_arg_t {
14596 Py_UCS4 ch;
14597 int flags;
14598 Py_ssize_t width;
14599 int prec;
14600 int sign;
14601};
14602
Guido van Rossumd57fd912000-03-10 22:53:23 +000014603static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014604unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014605{
Victor Stinnera47082312012-10-04 02:19:54 +020014606 Py_ssize_t argidx = ctx->argidx;
14607
14608 if (argidx < ctx->arglen) {
14609 ctx->argidx++;
14610 if (ctx->arglen < 0)
14611 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014612 else
Victor Stinnera47082312012-10-04 02:19:54 +020014613 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014614 }
14615 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014616 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014617 return NULL;
14618}
14619
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014620/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014621
Victor Stinnera47082312012-10-04 02:19:54 +020014622/* Format a float into the writer if the writer is not NULL, or into *p_output
14623 otherwise.
14624
14625 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014626static int
Victor Stinnera47082312012-10-04 02:19:54 +020014627formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14628 PyObject **p_output,
14629 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014630{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014631 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014632 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014633 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014634 int prec;
14635 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014636
Guido van Rossumd57fd912000-03-10 22:53:23 +000014637 x = PyFloat_AsDouble(v);
14638 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014639 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014640
Victor Stinnera47082312012-10-04 02:19:54 +020014641 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014642 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014643 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014644
Victor Stinnera47082312012-10-04 02:19:54 +020014645 if (arg->flags & F_ALT)
14646 dtoa_flags = Py_DTSF_ALT;
14647 else
14648 dtoa_flags = 0;
14649 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014650 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014651 return -1;
14652 len = strlen(p);
14653 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014654 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014655 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014656 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014657 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014658 }
14659 else
14660 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014661 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014662 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014663}
14664
Victor Stinnerd0880d52012-04-27 23:40:13 +020014665/* formatlong() emulates the format codes d, u, o, x and X, and
14666 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14667 * Python's regular ints.
14668 * Return value: a new PyUnicodeObject*, or NULL if error.
14669 * The output string is of the form
14670 * "-"? ("0x" | "0X")? digit+
14671 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14672 * set in flags. The case of hex digits will be correct,
14673 * There will be at least prec digits, zero-filled on the left if
14674 * necessary to get that many.
14675 * val object to be converted
14676 * flags bitmask of format flags; only F_ALT is looked at
14677 * prec minimum number of digits; 0-fill on left if needed
14678 * type a character in [duoxX]; u acts the same as d
14679 *
14680 * CAUTION: o, x and X conversions on regular ints can never
14681 * produce a '-' sign, but can for Python's unbounded ints.
14682 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014683PyObject *
14684_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014685{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014686 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014687 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014688 Py_ssize_t i;
14689 int sign; /* 1 if '-', else 0 */
14690 int len; /* number of characters */
14691 Py_ssize_t llen;
14692 int numdigits; /* len == numnondigits + numdigits */
14693 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014694
Victor Stinnerd0880d52012-04-27 23:40:13 +020014695 /* Avoid exceeding SSIZE_T_MAX */
14696 if (prec > INT_MAX-3) {
14697 PyErr_SetString(PyExc_OverflowError,
14698 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014699 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014700 }
14701
14702 assert(PyLong_Check(val));
14703
14704 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014705 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014706 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014707 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014708 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014709 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014710 /* int and int subclasses should print numerically when a numeric */
14711 /* format code is used (see issue18780) */
14712 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014713 break;
14714 case 'o':
14715 numnondigits = 2;
14716 result = PyNumber_ToBase(val, 8);
14717 break;
14718 case 'x':
14719 case 'X':
14720 numnondigits = 2;
14721 result = PyNumber_ToBase(val, 16);
14722 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014723 }
14724 if (!result)
14725 return NULL;
14726
14727 assert(unicode_modifiable(result));
14728 assert(PyUnicode_IS_READY(result));
14729 assert(PyUnicode_IS_ASCII(result));
14730
14731 /* To modify the string in-place, there can only be one reference. */
14732 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014733 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014734 PyErr_BadInternalCall();
14735 return NULL;
14736 }
14737 buf = PyUnicode_DATA(result);
14738 llen = PyUnicode_GET_LENGTH(result);
14739 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014740 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014741 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014742 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014743 return NULL;
14744 }
14745 len = (int)llen;
14746 sign = buf[0] == '-';
14747 numnondigits += sign;
14748 numdigits = len - numnondigits;
14749 assert(numdigits > 0);
14750
14751 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014752 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014753 (type == 'o' || type == 'x' || type == 'X'))) {
14754 assert(buf[sign] == '0');
14755 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14756 buf[sign+1] == 'o');
14757 numnondigits -= 2;
14758 buf += 2;
14759 len -= 2;
14760 if (sign)
14761 buf[0] = '-';
14762 assert(len == numnondigits + numdigits);
14763 assert(numdigits > 0);
14764 }
14765
14766 /* Fill with leading zeroes to meet minimum width. */
14767 if (prec > numdigits) {
14768 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14769 numnondigits + prec);
14770 char *b1;
14771 if (!r1) {
14772 Py_DECREF(result);
14773 return NULL;
14774 }
14775 b1 = PyBytes_AS_STRING(r1);
14776 for (i = 0; i < numnondigits; ++i)
14777 *b1++ = *buf++;
14778 for (i = 0; i < prec - numdigits; i++)
14779 *b1++ = '0';
14780 for (i = 0; i < numdigits; i++)
14781 *b1++ = *buf++;
14782 *b1 = '\0';
14783 Py_DECREF(result);
14784 result = r1;
14785 buf = PyBytes_AS_STRING(result);
14786 len = numnondigits + prec;
14787 }
14788
14789 /* Fix up case for hex conversions. */
14790 if (type == 'X') {
14791 /* Need to convert all lower case letters to upper case.
14792 and need to convert 0x to 0X (and -0x to -0X). */
14793 for (i = 0; i < len; i++)
14794 if (buf[i] >= 'a' && buf[i] <= 'x')
14795 buf[i] -= 'a'-'A';
14796 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014797 if (!PyUnicode_Check(result)
14798 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014799 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014800 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014801 Py_DECREF(result);
14802 result = unicode;
14803 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014804 else if (len != PyUnicode_GET_LENGTH(result)) {
14805 if (PyUnicode_Resize(&result, len) < 0)
14806 Py_CLEAR(result);
14807 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014808 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014809}
14810
Ethan Furmandf3ed242014-01-05 06:50:30 -080014811/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014812 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014813 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014814 * -1 and raise an exception on error */
14815static int
Victor Stinnera47082312012-10-04 02:19:54 +020014816mainformatlong(PyObject *v,
14817 struct unicode_format_arg_t *arg,
14818 PyObject **p_output,
14819 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014820{
14821 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014822 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014823
14824 if (!PyNumber_Check(v))
14825 goto wrongtype;
14826
Ethan Furman9ab74802014-03-21 06:38:46 -070014827 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014828 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014829 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014830 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014831 }
14832 else {
14833 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014834 }
14835 if (iobj == NULL ) {
14836 if (PyErr_ExceptionMatches(PyExc_TypeError))
14837 goto wrongtype;
14838 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014839 }
14840 assert(PyLong_Check(iobj));
14841 }
14842 else {
14843 iobj = v;
14844 Py_INCREF(iobj);
14845 }
14846
14847 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014848 && arg->width == -1 && arg->prec == -1
14849 && !(arg->flags & (F_SIGN | F_BLANK))
14850 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014851 {
14852 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014853 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014854 int base;
14855
Victor Stinnera47082312012-10-04 02:19:54 +020014856 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014857 {
14858 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014859 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014860 case 'd':
14861 case 'i':
14862 case 'u':
14863 base = 10;
14864 break;
14865 case 'o':
14866 base = 8;
14867 break;
14868 case 'x':
14869 case 'X':
14870 base = 16;
14871 break;
14872 }
14873
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014874 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14875 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014876 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014877 }
14878 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014879 return 1;
14880 }
14881
Ethan Furmanb95b5612015-01-23 20:05:18 -080014882 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014883 Py_DECREF(iobj);
14884 if (res == NULL)
14885 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014886 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014887 return 0;
14888
14889wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014890 switch(type)
14891 {
14892 case 'o':
14893 case 'x':
14894 case 'X':
14895 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014896 "%%%c format: an integer is required, "
14897 "not %.200s",
14898 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014899 break;
14900 default:
14901 PyErr_Format(PyExc_TypeError,
Serhiy Storchakae2ec0b22020-10-09 14:14:37 +030014902 "%%%c format: a real number is required, "
Victor Stinner998b8062018-09-12 00:23:25 +020014903 "not %.200s",
14904 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014905 break;
14906 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014907 return -1;
14908}
14909
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014910static Py_UCS4
14911formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014912{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014913 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014914 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014915 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014916 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014917 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014918 goto onError;
14919 }
14920 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014921 int overflow;
14922 long x = PyLong_AsLongAndOverflow(v, &overflow);
14923 if (x == -1 && PyErr_Occurred()) {
14924 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014925 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014926 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014927 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014928 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014929
Victor Stinner8faf8212011-12-08 22:14:11 +010014930 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014931 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014932 PyErr_SetString(PyExc_OverflowError,
14933 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014934 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014935 }
14936
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014937 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014938 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014939
Benjamin Peterson29060642009-01-31 22:14:21 +000014940 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014941 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014942 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014943 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014944}
14945
Victor Stinnera47082312012-10-04 02:19:54 +020014946/* Parse options of an argument: flags, width, precision.
14947 Handle also "%(name)" syntax.
14948
14949 Return 0 if the argument has been formatted into arg->str.
14950 Return 1 if the argument has been written into ctx->writer,
14951 Raise an exception and return -1 on error. */
14952static int
14953unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14954 struct unicode_format_arg_t *arg)
14955{
14956#define FORMAT_READ(ctx) \
14957 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14958
14959 PyObject *v;
14960
Victor Stinnera47082312012-10-04 02:19:54 +020014961 if (arg->ch == '(') {
14962 /* Get argument value from a dictionary. Example: "%(name)s". */
14963 Py_ssize_t keystart;
14964 Py_ssize_t keylen;
14965 PyObject *key;
14966 int pcount = 1;
14967
14968 if (ctx->dict == NULL) {
14969 PyErr_SetString(PyExc_TypeError,
14970 "format requires a mapping");
14971 return -1;
14972 }
14973 ++ctx->fmtpos;
14974 --ctx->fmtcnt;
14975 keystart = ctx->fmtpos;
14976 /* Skip over balanced parentheses */
14977 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14978 arg->ch = FORMAT_READ(ctx);
14979 if (arg->ch == ')')
14980 --pcount;
14981 else if (arg->ch == '(')
14982 ++pcount;
14983 ctx->fmtpos++;
14984 }
14985 keylen = ctx->fmtpos - keystart - 1;
14986 if (ctx->fmtcnt < 0 || pcount > 0) {
14987 PyErr_SetString(PyExc_ValueError,
14988 "incomplete format key");
14989 return -1;
14990 }
14991 key = PyUnicode_Substring(ctx->fmtstr,
14992 keystart, keystart + keylen);
14993 if (key == NULL)
14994 return -1;
14995 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014996 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014997 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014998 }
14999 ctx->args = PyObject_GetItem(ctx->dict, key);
15000 Py_DECREF(key);
15001 if (ctx->args == NULL)
15002 return -1;
15003 ctx->args_owned = 1;
15004 ctx->arglen = -1;
15005 ctx->argidx = -2;
15006 }
15007
15008 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020015009 while (--ctx->fmtcnt >= 0) {
15010 arg->ch = FORMAT_READ(ctx);
15011 ctx->fmtpos++;
15012 switch (arg->ch) {
15013 case '-': arg->flags |= F_LJUST; continue;
15014 case '+': arg->flags |= F_SIGN; continue;
15015 case ' ': arg->flags |= F_BLANK; continue;
15016 case '#': arg->flags |= F_ALT; continue;
15017 case '0': arg->flags |= F_ZERO; continue;
15018 }
15019 break;
15020 }
15021
15022 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020015023 if (arg->ch == '*') {
15024 v = unicode_format_getnextarg(ctx);
15025 if (v == NULL)
15026 return -1;
15027 if (!PyLong_Check(v)) {
15028 PyErr_SetString(PyExc_TypeError,
15029 "* wants int");
15030 return -1;
15031 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015032 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015033 if (arg->width == -1 && PyErr_Occurred())
15034 return -1;
15035 if (arg->width < 0) {
15036 arg->flags |= F_LJUST;
15037 arg->width = -arg->width;
15038 }
15039 if (--ctx->fmtcnt >= 0) {
15040 arg->ch = FORMAT_READ(ctx);
15041 ctx->fmtpos++;
15042 }
15043 }
15044 else if (arg->ch >= '0' && arg->ch <= '9') {
15045 arg->width = arg->ch - '0';
15046 while (--ctx->fmtcnt >= 0) {
15047 arg->ch = FORMAT_READ(ctx);
15048 ctx->fmtpos++;
15049 if (arg->ch < '0' || arg->ch > '9')
15050 break;
15051 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15052 mixing signed and unsigned comparison. Since arg->ch is between
15053 '0' and '9', casting to int is safe. */
15054 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15055 PyErr_SetString(PyExc_ValueError,
15056 "width too big");
15057 return -1;
15058 }
15059 arg->width = arg->width*10 + (arg->ch - '0');
15060 }
15061 }
15062
15063 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015064 if (arg->ch == '.') {
15065 arg->prec = 0;
15066 if (--ctx->fmtcnt >= 0) {
15067 arg->ch = FORMAT_READ(ctx);
15068 ctx->fmtpos++;
15069 }
15070 if (arg->ch == '*') {
15071 v = unicode_format_getnextarg(ctx);
15072 if (v == NULL)
15073 return -1;
15074 if (!PyLong_Check(v)) {
15075 PyErr_SetString(PyExc_TypeError,
15076 "* wants int");
15077 return -1;
15078 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015079 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015080 if (arg->prec == -1 && PyErr_Occurred())
15081 return -1;
15082 if (arg->prec < 0)
15083 arg->prec = 0;
15084 if (--ctx->fmtcnt >= 0) {
15085 arg->ch = FORMAT_READ(ctx);
15086 ctx->fmtpos++;
15087 }
15088 }
15089 else if (arg->ch >= '0' && arg->ch <= '9') {
15090 arg->prec = arg->ch - '0';
15091 while (--ctx->fmtcnt >= 0) {
15092 arg->ch = FORMAT_READ(ctx);
15093 ctx->fmtpos++;
15094 if (arg->ch < '0' || arg->ch > '9')
15095 break;
15096 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15097 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015098 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015099 return -1;
15100 }
15101 arg->prec = arg->prec*10 + (arg->ch - '0');
15102 }
15103 }
15104 }
15105
15106 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15107 if (ctx->fmtcnt >= 0) {
15108 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15109 if (--ctx->fmtcnt >= 0) {
15110 arg->ch = FORMAT_READ(ctx);
15111 ctx->fmtpos++;
15112 }
15113 }
15114 }
15115 if (ctx->fmtcnt < 0) {
15116 PyErr_SetString(PyExc_ValueError,
15117 "incomplete format");
15118 return -1;
15119 }
15120 return 0;
15121
15122#undef FORMAT_READ
15123}
15124
15125/* Format one argument. Supported conversion specifiers:
15126
15127 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015128 - "i", "d", "u": int or float
15129 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015130 - "e", "E", "f", "F", "g", "G": float
15131 - "c": int or str (1 character)
15132
Victor Stinner8dbd4212012-12-04 09:30:24 +010015133 When possible, the output is written directly into the Unicode writer
15134 (ctx->writer). A string is created when padding is required.
15135
Victor Stinnera47082312012-10-04 02:19:54 +020015136 Return 0 if the argument has been formatted into *p_str,
15137 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015138 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015139static int
15140unicode_format_arg_format(struct unicode_formatter_t *ctx,
15141 struct unicode_format_arg_t *arg,
15142 PyObject **p_str)
15143{
15144 PyObject *v;
15145 _PyUnicodeWriter *writer = &ctx->writer;
15146
15147 if (ctx->fmtcnt == 0)
15148 ctx->writer.overallocate = 0;
15149
Victor Stinnera47082312012-10-04 02:19:54 +020015150 v = unicode_format_getnextarg(ctx);
15151 if (v == NULL)
15152 return -1;
15153
Victor Stinnera47082312012-10-04 02:19:54 +020015154
15155 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015156 case 's':
15157 case 'r':
15158 case 'a':
15159 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15160 /* Fast path */
15161 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15162 return -1;
15163 return 1;
15164 }
15165
15166 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15167 *p_str = v;
15168 Py_INCREF(*p_str);
15169 }
15170 else {
15171 if (arg->ch == 's')
15172 *p_str = PyObject_Str(v);
15173 else if (arg->ch == 'r')
15174 *p_str = PyObject_Repr(v);
15175 else
15176 *p_str = PyObject_ASCII(v);
15177 }
15178 break;
15179
15180 case 'i':
15181 case 'd':
15182 case 'u':
15183 case 'o':
15184 case 'x':
15185 case 'X':
15186 {
15187 int ret = mainformatlong(v, arg, p_str, writer);
15188 if (ret != 0)
15189 return ret;
15190 arg->sign = 1;
15191 break;
15192 }
15193
15194 case 'e':
15195 case 'E':
15196 case 'f':
15197 case 'F':
15198 case 'g':
15199 case 'G':
15200 if (arg->width == -1 && arg->prec == -1
15201 && !(arg->flags & (F_SIGN | F_BLANK)))
15202 {
15203 /* Fast path */
15204 if (formatfloat(v, arg, NULL, writer) == -1)
15205 return -1;
15206 return 1;
15207 }
15208
15209 arg->sign = 1;
15210 if (formatfloat(v, arg, p_str, NULL) == -1)
15211 return -1;
15212 break;
15213
15214 case 'c':
15215 {
15216 Py_UCS4 ch = formatchar(v);
15217 if (ch == (Py_UCS4) -1)
15218 return -1;
15219 if (arg->width == -1 && arg->prec == -1) {
15220 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015221 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015222 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015223 return 1;
15224 }
15225 *p_str = PyUnicode_FromOrdinal(ch);
15226 break;
15227 }
15228
15229 default:
15230 PyErr_Format(PyExc_ValueError,
15231 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015232 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015233 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15234 (int)arg->ch,
15235 ctx->fmtpos - 1);
15236 return -1;
15237 }
15238 if (*p_str == NULL)
15239 return -1;
15240 assert (PyUnicode_Check(*p_str));
15241 return 0;
15242}
15243
15244static int
15245unicode_format_arg_output(struct unicode_formatter_t *ctx,
15246 struct unicode_format_arg_t *arg,
15247 PyObject *str)
15248{
15249 Py_ssize_t len;
15250 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015251 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015252 Py_ssize_t pindex;
15253 Py_UCS4 signchar;
15254 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015255 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015256 Py_ssize_t sublen;
15257 _PyUnicodeWriter *writer = &ctx->writer;
15258 Py_UCS4 fill;
15259
15260 fill = ' ';
15261 if (arg->sign && arg->flags & F_ZERO)
15262 fill = '0';
15263
15264 if (PyUnicode_READY(str) == -1)
15265 return -1;
15266
15267 len = PyUnicode_GET_LENGTH(str);
15268 if ((arg->width == -1 || arg->width <= len)
15269 && (arg->prec == -1 || arg->prec >= len)
15270 && !(arg->flags & (F_SIGN | F_BLANK)))
15271 {
15272 /* Fast path */
15273 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15274 return -1;
15275 return 0;
15276 }
15277
15278 /* Truncate the string for "s", "r" and "a" formats
15279 if the precision is set */
15280 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15281 if (arg->prec >= 0 && len > arg->prec)
15282 len = arg->prec;
15283 }
15284
15285 /* Adjust sign and width */
15286 kind = PyUnicode_KIND(str);
15287 pbuf = PyUnicode_DATA(str);
15288 pindex = 0;
15289 signchar = '\0';
15290 if (arg->sign) {
15291 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15292 if (ch == '-' || ch == '+') {
15293 signchar = ch;
15294 len--;
15295 pindex++;
15296 }
15297 else if (arg->flags & F_SIGN)
15298 signchar = '+';
15299 else if (arg->flags & F_BLANK)
15300 signchar = ' ';
15301 else
15302 arg->sign = 0;
15303 }
15304 if (arg->width < len)
15305 arg->width = len;
15306
15307 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015308 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015309 if (!(arg->flags & F_LJUST)) {
15310 if (arg->sign) {
15311 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015312 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015313 }
15314 else {
15315 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015316 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015317 }
15318 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015319 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15320 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015321 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015322 }
15323
Victor Stinnera47082312012-10-04 02:19:54 +020015324 buflen = arg->width;
15325 if (arg->sign && len == arg->width)
15326 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015327 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015328 return -1;
15329
15330 /* Write the sign if needed */
15331 if (arg->sign) {
15332 if (fill != ' ') {
15333 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15334 writer->pos += 1;
15335 }
15336 if (arg->width > len)
15337 arg->width--;
15338 }
15339
15340 /* Write the numeric prefix for "x", "X" and "o" formats
15341 if the alternate form is used.
15342 For example, write "0x" for the "%#x" format. */
15343 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15344 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15345 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15346 if (fill != ' ') {
15347 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15348 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15349 writer->pos += 2;
15350 pindex += 2;
15351 }
15352 arg->width -= 2;
15353 if (arg->width < 0)
15354 arg->width = 0;
15355 len -= 2;
15356 }
15357
15358 /* Pad left with the fill character if needed */
15359 if (arg->width > len && !(arg->flags & F_LJUST)) {
15360 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015361 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015362 writer->pos += sublen;
15363 arg->width = len;
15364 }
15365
15366 /* If padding with spaces: write sign if needed and/or numeric prefix if
15367 the alternate form is used */
15368 if (fill == ' ') {
15369 if (arg->sign) {
15370 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15371 writer->pos += 1;
15372 }
15373 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15374 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15375 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15376 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15377 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15378 writer->pos += 2;
15379 pindex += 2;
15380 }
15381 }
15382
15383 /* Write characters */
15384 if (len) {
15385 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15386 str, pindex, len);
15387 writer->pos += len;
15388 }
15389
15390 /* Pad right with the fill character if needed */
15391 if (arg->width > len) {
15392 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015393 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015394 writer->pos += sublen;
15395 }
15396 return 0;
15397}
15398
15399/* Helper of PyUnicode_Format(): format one arg.
15400 Return 0 on success, raise an exception and return -1 on error. */
15401static int
15402unicode_format_arg(struct unicode_formatter_t *ctx)
15403{
15404 struct unicode_format_arg_t arg;
15405 PyObject *str;
15406 int ret;
15407
Victor Stinner8dbd4212012-12-04 09:30:24 +010015408 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015409 if (arg.ch == '%') {
15410 ctx->fmtpos++;
15411 ctx->fmtcnt--;
15412 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15413 return -1;
15414 return 0;
15415 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015416 arg.flags = 0;
15417 arg.width = -1;
15418 arg.prec = -1;
15419 arg.sign = 0;
15420 str = NULL;
15421
Victor Stinnera47082312012-10-04 02:19:54 +020015422 ret = unicode_format_arg_parse(ctx, &arg);
15423 if (ret == -1)
15424 return -1;
15425
15426 ret = unicode_format_arg_format(ctx, &arg, &str);
15427 if (ret == -1)
15428 return -1;
15429
15430 if (ret != 1) {
15431 ret = unicode_format_arg_output(ctx, &arg, str);
15432 Py_DECREF(str);
15433 if (ret == -1)
15434 return -1;
15435 }
15436
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015437 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015438 PyErr_SetString(PyExc_TypeError,
15439 "not all arguments converted during string formatting");
15440 return -1;
15441 }
15442 return 0;
15443}
15444
Alexander Belopolsky40018472011-02-26 01:02:56 +000015445PyObject *
15446PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015447{
Victor Stinnera47082312012-10-04 02:19:54 +020015448 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015449
Guido van Rossumd57fd912000-03-10 22:53:23 +000015450 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015451 PyErr_BadInternalCall();
15452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015453 }
Victor Stinnera47082312012-10-04 02:19:54 +020015454
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015455 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015456 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015457
15458 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015459 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15460 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15461 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15462 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015463
Victor Stinner8f674cc2013-04-17 23:02:17 +020015464 _PyUnicodeWriter_Init(&ctx.writer);
15465 ctx.writer.min_length = ctx.fmtcnt + 100;
15466 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015467
Guido van Rossumd57fd912000-03-10 22:53:23 +000015468 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015469 ctx.arglen = PyTuple_Size(args);
15470 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015471 }
15472 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015473 ctx.arglen = -1;
15474 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015475 }
Victor Stinnera47082312012-10-04 02:19:54 +020015476 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015477 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015478 ctx.dict = args;
15479 else
15480 ctx.dict = NULL;
15481 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015482
Victor Stinnera47082312012-10-04 02:19:54 +020015483 while (--ctx.fmtcnt >= 0) {
15484 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015485 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015486
15487 nonfmtpos = ctx.fmtpos++;
15488 while (ctx.fmtcnt >= 0 &&
15489 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15490 ctx.fmtpos++;
15491 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015492 }
Victor Stinnera47082312012-10-04 02:19:54 +020015493 if (ctx.fmtcnt < 0) {
15494 ctx.fmtpos--;
15495 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015496 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015497
Victor Stinnercfc4c132013-04-03 01:48:39 +020015498 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15499 nonfmtpos, ctx.fmtpos) < 0)
15500 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015501 }
15502 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015503 ctx.fmtpos++;
15504 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015505 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015506 }
15507 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015508
Victor Stinnera47082312012-10-04 02:19:54 +020015509 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015510 PyErr_SetString(PyExc_TypeError,
15511 "not all arguments converted during string formatting");
15512 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015513 }
15514
Victor Stinnera47082312012-10-04 02:19:54 +020015515 if (ctx.args_owned) {
15516 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015517 }
Victor Stinnera47082312012-10-04 02:19:54 +020015518 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015519
Benjamin Peterson29060642009-01-31 22:14:21 +000015520 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015521 _PyUnicodeWriter_Dealloc(&ctx.writer);
15522 if (ctx.args_owned) {
15523 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015524 }
15525 return NULL;
15526}
15527
Jeremy Hylton938ace62002-07-17 16:30:39 +000015528static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015529unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15530
15531/*[clinic input]
15532@classmethod
15533str.__new__ as unicode_new
15534
15535 object as x: object = NULL
15536 encoding: str = NULL
15537 errors: str = NULL
15538
15539[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015540
Tim Peters6d6c1a32001-08-02 04:15:00 +000015541static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015542unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15543 const char *errors)
15544/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015545{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015546 PyObject *unicode;
15547 if (x == NULL) {
15548 unicode = unicode_new_empty();
15549 }
15550 else if (encoding == NULL && errors == NULL) {
15551 unicode = PyObject_Str(x);
15552 }
15553 else {
15554 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15555 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015556
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015557 if (unicode != NULL && type != &PyUnicode_Type) {
15558 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15559 }
15560 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015561}
15562
Guido van Rossume023fe02001-08-30 03:12:59 +000015563static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015564unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015565{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015566 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015567 Py_ssize_t length, char_size;
15568 int share_wstr, share_utf8;
15569 unsigned int kind;
15570 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015571
Benjamin Peterson14339b62009-01-31 16:36:08 +000015572 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015573 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015574 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015575 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015576 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015577
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015578 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015579 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015580 return NULL;
15581 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015582 kind = PyUnicode_KIND(unicode);
15583 length = PyUnicode_GET_LENGTH(unicode);
15584
15585 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015586#ifdef Py_DEBUG
15587 _PyUnicode_HASH(self) = -1;
15588#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015589 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015590#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015591 _PyUnicode_STATE(self).interned = 0;
15592 _PyUnicode_STATE(self).kind = kind;
15593 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015594 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015595 _PyUnicode_STATE(self).ready = 1;
15596 _PyUnicode_WSTR(self) = NULL;
15597 _PyUnicode_UTF8_LENGTH(self) = 0;
15598 _PyUnicode_UTF8(self) = NULL;
15599 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015600 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015601
15602 share_utf8 = 0;
15603 share_wstr = 0;
15604 if (kind == PyUnicode_1BYTE_KIND) {
15605 char_size = 1;
15606 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15607 share_utf8 = 1;
15608 }
15609 else if (kind == PyUnicode_2BYTE_KIND) {
15610 char_size = 2;
15611 if (sizeof(wchar_t) == 2)
15612 share_wstr = 1;
15613 }
15614 else {
15615 assert(kind == PyUnicode_4BYTE_KIND);
15616 char_size = 4;
15617 if (sizeof(wchar_t) == 4)
15618 share_wstr = 1;
15619 }
15620
15621 /* Ensure we won't overflow the length. */
15622 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15623 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015624 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015625 }
Victor Stinner32bd68c2020-12-01 10:37:39 +010015626 data = PyObject_Malloc((length + 1) * char_size);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015627 if (data == NULL) {
15628 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015629 goto onError;
15630 }
15631
Victor Stinnerc3c74152011-10-02 20:39:55 +020015632 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015633 if (share_utf8) {
15634 _PyUnicode_UTF8_LENGTH(self) = length;
15635 _PyUnicode_UTF8(self) = data;
15636 }
15637 if (share_wstr) {
15638 _PyUnicode_WSTR_LENGTH(self) = length;
15639 _PyUnicode_WSTR(self) = (wchar_t *)data;
15640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015641
Christian Heimesf051e432016-09-13 20:22:02 +020015642 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015643 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015644 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015645#ifdef Py_DEBUG
15646 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15647#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015648 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015649
15650onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015651 Py_DECREF(self);
15652 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015653}
15654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015655PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015656"str(object='') -> str\n\
15657str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015658\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015659Create a new string object from the given object. If encoding or\n\
15660errors is specified, then the object must expose a data buffer\n\
15661that will be decoded using the given encoding and error handler.\n\
15662Otherwise, returns the result of object.__str__() (if defined)\n\
15663or repr(object).\n\
15664encoding defaults to sys.getdefaultencoding().\n\
15665errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015666
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015667static PyObject *unicode_iter(PyObject *seq);
15668
Guido van Rossumd57fd912000-03-10 22:53:23 +000015669PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015670 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015671 "str", /* tp_name */
15672 sizeof(PyUnicodeObject), /* tp_basicsize */
15673 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015674 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015675 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015676 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015677 0, /* tp_getattr */
15678 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015679 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015680 unicode_repr, /* tp_repr */
15681 &unicode_as_number, /* tp_as_number */
15682 &unicode_as_sequence, /* tp_as_sequence */
15683 &unicode_as_mapping, /* tp_as_mapping */
15684 (hashfunc) unicode_hash, /* tp_hash*/
15685 0, /* tp_call*/
15686 (reprfunc) unicode_str, /* tp_str */
15687 PyObject_GenericGetAttr, /* tp_getattro */
15688 0, /* tp_setattro */
15689 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015690 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Brandt Bucher145bf262021-02-26 14:51:55 -080015691 Py_TPFLAGS_UNICODE_SUBCLASS |
15692 _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
Bupfc93bd42018-06-19 03:59:55 -050015693 unicode_doc, /* tp_doc */
15694 0, /* tp_traverse */
15695 0, /* tp_clear */
15696 PyUnicode_RichCompare, /* tp_richcompare */
15697 0, /* tp_weaklistoffset */
15698 unicode_iter, /* tp_iter */
15699 0, /* tp_iternext */
15700 unicode_methods, /* tp_methods */
15701 0, /* tp_members */
15702 0, /* tp_getset */
15703 &PyBaseObject_Type, /* tp_base */
15704 0, /* tp_dict */
15705 0, /* tp_descr_get */
15706 0, /* tp_descr_set */
15707 0, /* tp_dictoffset */
15708 0, /* tp_init */
15709 0, /* tp_alloc */
15710 unicode_new, /* tp_new */
15711 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015712};
15713
15714/* Initialize the Unicode implementation */
15715
Victor Stinner331a6a52019-05-27 16:39:22 +020015716PyStatus
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015717_PyUnicode_Init(PyInterpreterState *interp)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015718{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015719 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinner91698d82020-06-25 14:07:40 +020015720 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015721 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015722 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015723
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015724 if (_Py_IsMainInterpreter(interp)) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015725 /* initialize the linebreak bloom filter */
Victor Stinner442ad742021-04-02 15:28:13 +020015726 const Py_UCS2 linebreak[] = {
15727 0x000A, /* LINE FEED */
15728 0x000D, /* CARRIAGE RETURN */
15729 0x001C, /* FILE SEPARATOR */
15730 0x001D, /* GROUP SEPARATOR */
15731 0x001E, /* RECORD SEPARATOR */
15732 0x0085, /* NEXT LINE */
15733 0x2028, /* LINE SEPARATOR */
15734 0x2029, /* PARAGRAPH SEPARATOR */
15735 };
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015736 bloom_linebreak = make_bloom_mask(
15737 PyUnicode_2BYTE_KIND, linebreak,
15738 Py_ARRAY_LENGTH(linebreak));
Victor Stinner442ad742021-04-02 15:28:13 +020015739 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015740
Victor Stinner442ad742021-04-02 15:28:13 +020015741 return _PyStatus_OK();
15742}
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015743
Victor Stinner442ad742021-04-02 15:28:13 +020015744
15745PyStatus
15746_PyUnicode_InitTypes(void)
15747{
15748 if (PyType_Ready(&PyUnicode_Type) < 0) {
15749 return _PyStatus_ERR("Can't initialize unicode type");
15750 }
15751 if (PyType_Ready(&EncodingMapType) < 0) {
15752 return _PyStatus_ERR("Can't initialize encoding map type");
15753 }
15754 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15755 return _PyStatus_ERR("Can't initialize field name iterator type");
15756 }
15757 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15758 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015759 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015760 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015761}
15762
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015763
Walter Dörwald16807132007-05-25 13:52:07 +000015764void
15765PyUnicode_InternInPlace(PyObject **p)
15766{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015767 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015768#ifdef Py_DEBUG
15769 assert(s != NULL);
15770 assert(_PyUnicode_CHECK(s));
15771#else
Victor Stinner607b1022020-05-05 18:50:30 +020015772 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015773 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015774 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015775#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015776
Benjamin Peterson14339b62009-01-31 16:36:08 +000015777 /* If it's a subclass, we don't really know what putting
15778 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015779 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015780 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015781 }
15782
15783 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015784 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015785 }
15786
Victor Stinner666ecfb2020-07-02 01:19:57 +020015787 if (PyUnicode_READY(s) == -1) {
15788 PyErr_Clear();
15789 return;
15790 }
15791
Victor Stinnerea251802020-12-26 02:58:33 +010015792 struct _Py_unicode_state *state = get_unicode_state();
15793 if (state->interned == NULL) {
15794 state->interned = PyDict_New();
15795 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015796 PyErr_Clear(); /* Don't leave an exception */
15797 return;
15798 }
15799 }
Victor Stinner607b1022020-05-05 18:50:30 +020015800
Victor Stinnerea251802020-12-26 02:58:33 +010015801 PyObject *t = PyDict_SetDefault(state->interned, s, s);
Berker Peksagced8d4c2016-07-25 04:40:39 +030015802 if (t == NULL) {
15803 PyErr_Clear();
15804 return;
15805 }
Victor Stinner607b1022020-05-05 18:50:30 +020015806
Berker Peksagced8d4c2016-07-25 04:40:39 +030015807 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015808 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015809 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015810 return;
15811 }
Victor Stinner607b1022020-05-05 18:50:30 +020015812
Victor Stinner3549ca32020-07-03 16:59:12 +020015813 /* The two references in interned dict (key and value) are not counted by
15814 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15815 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015816 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015817 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015818}
15819
Victor Stinnerea251802020-12-26 02:58:33 +010015820
Walter Dörwald16807132007-05-25 13:52:07 +000015821void
15822PyUnicode_InternImmortal(PyObject **p)
15823{
Victor Stinner583ee5a2020-10-02 14:49:00 +020015824 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15825 "PyUnicode_InternImmortal() is deprecated; "
15826 "use PyUnicode_InternInPlace() instead", 1) < 0)
15827 {
15828 // The function has no return value, the exception cannot
15829 // be reported to the caller, so just log it.
15830 PyErr_WriteUnraisable(NULL);
15831 }
15832
Benjamin Peterson14339b62009-01-31 16:36:08 +000015833 PyUnicode_InternInPlace(p);
15834 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015835 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015836 Py_INCREF(*p);
15837 }
Walter Dörwald16807132007-05-25 13:52:07 +000015838}
15839
15840PyObject *
15841PyUnicode_InternFromString(const char *cp)
15842{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015843 PyObject *s = PyUnicode_FromString(cp);
15844 if (s == NULL)
15845 return NULL;
15846 PyUnicode_InternInPlace(&s);
15847 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015848}
15849
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015850
Victor Stinner666ecfb2020-07-02 01:19:57 +020015851void
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015852_PyUnicode_ClearInterned(PyInterpreterState *interp)
Walter Dörwald16807132007-05-25 13:52:07 +000015853{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015854 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinnerea251802020-12-26 02:58:33 +010015855 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015856 return;
15857 }
Victor Stinnerea251802020-12-26 02:58:33 +010015858 assert(PyDict_CheckExact(state->interned));
Victor Stinner666ecfb2020-07-02 01:19:57 +020015859
15860 /* Interned unicode strings are not forcibly deallocated; rather, we give
15861 them their stolen references back, and then clear and DECREF the
15862 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015863
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015864#ifdef INTERNED_STATS
Victor Stinnerea251802020-12-26 02:58:33 +010015865 fprintf(stderr, "releasing %zd interned strings\n",
15866 PyDict_GET_SIZE(state->interned));
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015867
15868 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015869#endif
Victor Stinnerea251802020-12-26 02:58:33 +010015870 Py_ssize_t pos = 0;
15871 PyObject *s, *ignored_value;
15872 while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) {
Victor Stinner666ecfb2020-07-02 01:19:57 +020015873 assert(PyUnicode_IS_READY(s));
15874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015875 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015876 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015877 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015878#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015879 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015880#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015881 break;
15882 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015883 // Restore the two references (key and value) ignored
15884 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015885 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015886#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015887 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015888#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015889 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015890 case SSTATE_NOT_INTERNED:
15891 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015892 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015893 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015895 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015896 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015897#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015898 fprintf(stderr,
15899 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15900 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015901#endif
Victor Stinner666ecfb2020-07-02 01:19:57 +020015902
Victor Stinnerea251802020-12-26 02:58:33 +010015903 PyDict_Clear(state->interned);
15904 Py_CLEAR(state->interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015905}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015906
15907
15908/********************* Unicode Iterator **************************/
15909
15910typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015911 PyObject_HEAD
15912 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015913 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015914} unicodeiterobject;
15915
15916static void
15917unicodeiter_dealloc(unicodeiterobject *it)
15918{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015919 _PyObject_GC_UNTRACK(it);
15920 Py_XDECREF(it->it_seq);
15921 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015922}
15923
15924static int
15925unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15926{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015927 Py_VISIT(it->it_seq);
15928 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015929}
15930
15931static PyObject *
15932unicodeiter_next(unicodeiterobject *it)
15933{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015934 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015935
Benjamin Peterson14339b62009-01-31 16:36:08 +000015936 assert(it != NULL);
15937 seq = it->it_seq;
15938 if (seq == NULL)
15939 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015940 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015942 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15943 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015944 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015945 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15946 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015947 if (item != NULL)
15948 ++it->it_index;
15949 return item;
15950 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015951
Benjamin Peterson14339b62009-01-31 16:36:08 +000015952 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015953 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015954 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015955}
15956
15957static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015958unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015959{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015960 Py_ssize_t len = 0;
15961 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015962 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015963 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015964}
15965
15966PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15967
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015968static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015969unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015970{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015971 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015972 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015973 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015974 it->it_seq, it->it_index);
15975 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015976 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015977 if (u == NULL)
15978 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015979 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015980 }
15981}
15982
15983PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15984
15985static PyObject *
15986unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15987{
15988 Py_ssize_t index = PyLong_AsSsize_t(state);
15989 if (index == -1 && PyErr_Occurred())
15990 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015991 if (it->it_seq != NULL) {
15992 if (index < 0)
15993 index = 0;
15994 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15995 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15996 it->it_index = index;
15997 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015998 Py_RETURN_NONE;
15999}
16000
16001PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16002
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016003static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000016004 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000016005 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016006 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
16007 reduce_doc},
16008 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
16009 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000016010 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016011};
16012
16013PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000016014 PyVarObject_HEAD_INIT(&PyType_Type, 0)
16015 "str_iterator", /* tp_name */
16016 sizeof(unicodeiterobject), /* tp_basicsize */
16017 0, /* tp_itemsize */
16018 /* methods */
16019 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020016020 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000016021 0, /* tp_getattr */
16022 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020016023 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000016024 0, /* tp_repr */
16025 0, /* tp_as_number */
16026 0, /* tp_as_sequence */
16027 0, /* tp_as_mapping */
16028 0, /* tp_hash */
16029 0, /* tp_call */
16030 0, /* tp_str */
16031 PyObject_GenericGetAttr, /* tp_getattro */
16032 0, /* tp_setattro */
16033 0, /* tp_as_buffer */
16034 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16035 0, /* tp_doc */
16036 (traverseproc)unicodeiter_traverse, /* tp_traverse */
16037 0, /* tp_clear */
16038 0, /* tp_richcompare */
16039 0, /* tp_weaklistoffset */
16040 PyObject_SelfIter, /* tp_iter */
16041 (iternextfunc)unicodeiter_next, /* tp_iternext */
16042 unicodeiter_methods, /* tp_methods */
16043 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016044};
16045
16046static PyObject *
16047unicode_iter(PyObject *seq)
16048{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016049 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016050
Benjamin Peterson14339b62009-01-31 16:36:08 +000016051 if (!PyUnicode_Check(seq)) {
16052 PyErr_BadInternalCall();
16053 return NULL;
16054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020016055 if (PyUnicode_READY(seq) == -1)
16056 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016057 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16058 if (it == NULL)
16059 return NULL;
16060 it->it_index = 0;
16061 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016062 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016063 _PyObject_GC_TRACK(it);
16064 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016065}
16066
Victor Stinner709d23d2019-05-02 14:56:30 -040016067static int
16068encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016069{
Victor Stinner709d23d2019-05-02 14:56:30 -040016070 int res;
16071 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16072 if (res == -2) {
16073 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16074 return -1;
16075 }
16076 if (res < 0) {
16077 PyErr_NoMemory();
16078 return -1;
16079 }
16080 return 0;
16081}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016082
Victor Stinner709d23d2019-05-02 14:56:30 -040016083
16084static int
16085config_get_codec_name(wchar_t **config_encoding)
16086{
16087 char *encoding;
16088 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16089 return -1;
16090 }
16091
16092 PyObject *name_obj = NULL;
16093 PyObject *codec = _PyCodec_Lookup(encoding);
16094 PyMem_RawFree(encoding);
16095
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016096 if (!codec)
16097 goto error;
16098
16099 name_obj = PyObject_GetAttrString(codec, "name");
16100 Py_CLEAR(codec);
16101 if (!name_obj) {
16102 goto error;
16103 }
16104
Victor Stinner709d23d2019-05-02 14:56:30 -040016105 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16106 Py_DECREF(name_obj);
16107 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016108 goto error;
16109 }
16110
Victor Stinner709d23d2019-05-02 14:56:30 -040016111 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16112 if (raw_wname == NULL) {
16113 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016114 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016115 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016116 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016117
16118 PyMem_RawFree(*config_encoding);
16119 *config_encoding = raw_wname;
16120
16121 PyMem_Free(wname);
16122 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016123
16124error:
16125 Py_XDECREF(codec);
16126 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016127 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016128}
16129
16130
Victor Stinner331a6a52019-05-27 16:39:22 +020016131static PyStatus
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016132init_stdio_encoding(PyInterpreterState *interp)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016133{
Victor Stinner709d23d2019-05-02 14:56:30 -040016134 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016135 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016136 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016137 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016138 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016139 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016140 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016141}
16142
16143
Victor Stinner709d23d2019-05-02 14:56:30 -040016144static int
16145init_fs_codec(PyInterpreterState *interp)
16146{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016147 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016148
16149 _Py_error_handler error_handler;
16150 error_handler = get_error_handler_wide(config->filesystem_errors);
16151 if (error_handler == _Py_ERROR_UNKNOWN) {
16152 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16153 return -1;
16154 }
16155
16156 char *encoding, *errors;
16157 if (encode_wstr_utf8(config->filesystem_encoding,
16158 &encoding,
16159 "filesystem_encoding") < 0) {
16160 return -1;
16161 }
16162
16163 if (encode_wstr_utf8(config->filesystem_errors,
16164 &errors,
16165 "filesystem_errors") < 0) {
16166 PyMem_RawFree(encoding);
16167 return -1;
16168 }
16169
Victor Stinner3d17c042020-05-14 01:48:38 +020016170 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16171 PyMem_RawFree(fs_codec->encoding);
16172 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016173 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016174 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16175 PyMem_RawFree(fs_codec->errors);
16176 fs_codec->errors = errors;
16177 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016178
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016179#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016180 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016181#endif
16182
Victor Stinner709d23d2019-05-02 14:56:30 -040016183 /* At this point, PyUnicode_EncodeFSDefault() and
16184 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16185 the C implementation of the filesystem encoding. */
16186
16187 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16188 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016189 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16190 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016191 PyErr_NoMemory();
16192 return -1;
16193 }
16194 return 0;
16195}
16196
16197
Victor Stinner331a6a52019-05-27 16:39:22 +020016198static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016199init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016200{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016201 PyInterpreterState *interp = tstate->interp;
16202
Victor Stinner709d23d2019-05-02 14:56:30 -040016203 /* Update the filesystem encoding to the normalized Python codec name.
16204 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16205 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016206 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016207 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016208 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016209 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016210 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016211 }
16212
Victor Stinner709d23d2019-05-02 14:56:30 -040016213 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016214 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016215 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016216 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016217}
16218
16219
Victor Stinner331a6a52019-05-27 16:39:22 +020016220PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016221_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016222{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016223 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016224 if (_PyStatus_EXCEPTION(status)) {
16225 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016226 }
16227
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016228 return init_stdio_encoding(tstate->interp);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016229}
16230
16231
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016232static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016233_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016234{
Victor Stinner3d17c042020-05-14 01:48:38 +020016235 PyMem_RawFree(fs_codec->encoding);
16236 fs_codec->encoding = NULL;
16237 fs_codec->utf8 = 0;
16238 PyMem_RawFree(fs_codec->errors);
16239 fs_codec->errors = NULL;
16240 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016241}
16242
16243
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with
   the "replace" error handler (PEP 529).

   Both replacement strings are allocated before the configuration is
   modified, so an allocation failure leaves the configuration
   untouched.

   Return -1 with a memory error set if an allocation fails; otherwise
   return the result of init_fs_codec(). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    wchar_t *mbcs_name = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *replace_name = _PyMem_RawWcsdup(L"replace");
    if (!(mbcs_name != NULL && replace_name != NULL)) {
        /* At most one of the two is non-NULL here; freeing NULL is
           harmless, so release both unconditionally. */
        PyMem_RawFree(mbcs_name);
        PyMem_RawFree(replace_name);
        PyErr_NoMemory();
        return -1;
    }

    /* Drop the previous strings and hand ownership of the new ones to
       the configuration. */
    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = mbcs_name;
    config->filesystem_errors = replace_name;

    return init_fs_codec(interp);
}
#endif
16269
16270
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016271void
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016272_PyUnicode_Fini(PyInterpreterState *interp)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016273{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016274 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016275
Victor Stinnerea251802020-12-26 02:58:33 +010016276 // _PyUnicode_ClearInterned() must be called before
16277 assert(state->interned == NULL);
16278
16279 _PyUnicode_FiniEncodings(&state->fs_codec);
16280
Victor Stinnerf4507232020-12-26 20:26:08 +010016281 unicode_clear_identifiers(state);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016282
Victor Stinner2f9ada92020-06-24 02:22:21 +020016283 for (Py_ssize_t i = 0; i < 256; i++) {
16284 Py_CLEAR(state->latin1[i]);
16285 }
Victor Stinnerea251802020-12-26 02:58:33 +010016286 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016287}
16288
16289
Georg Brandl66c221e2010-10-14 07:04:07 +000016290/* A _string module, to export formatter_parser and formatter_field_name_split
16291 to the string.Formatter class implemented in Python. */
16292
16293static PyMethodDef _string_methods[] = {
16294 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16295 METH_O, PyDoc_STR("split the argument as a field name")},
16296 {"formatter_parser", (PyCFunction) formatter_parser,
16297 METH_O, PyDoc_STR("parse the argument as a format string")},
16298 {NULL, NULL}
16299};
16300
16301static struct PyModuleDef _string_module = {
16302 PyModuleDef_HEAD_INIT,
Victor Stinnerbb083d32020-09-08 15:33:08 +020016303 .m_name = "_string",
16304 .m_doc = PyDoc_STR("string helper module"),
16305 .m_size = 0,
16306 .m_methods = _string_methods,
Georg Brandl66c221e2010-10-14 07:04:07 +000016307};
16308
16309PyMODINIT_FUNC
16310PyInit__string(void)
16311{
Victor Stinnerbb083d32020-09-08 15:33:08 +020016312 return PyModuleDef_Init(&_string_module);
Georg Brandl66c221e2010-10-14 07:04:07 +000016313}
16314
16315
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016316#ifdef __cplusplus
16317}
16318#endif