blob: 409355534a2ce180482a5a4df50bf6582503ffc1 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010043#include "pycore_abstract.h" // _PyIndex_Check()
44#include "pycore_bytes_methods.h" // _Py_bytes_lower()
Serhiy Storchaka2ad93822020-12-03 12:46:16 +020045#include "pycore_format.h" // F_LJUST
Victor Stinner47e1afd2020-10-26 16:43:47 +010046#include "pycore_initconfig.h" // _PyStatus_OK()
47#include "pycore_interp.h" // PyInterpreterState.fs_codec
48#include "pycore_object.h" // _PyObject_GC_TRACK()
49#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
50#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
51#include "pycore_pystate.h" // _PyInterpreterState_GET()
52#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
53#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
Victor Stinner666ecfb2020-07-02 01:19:57 +020059/* Uncomment to display statistics on interned strings at exit
60 in _PyUnicode_ClearInterned(). */
Victor Stinnerfecc4f22019-03-19 14:20:29 +010061/* #define INTERNED_STATS 1 */
62
63
Larry Hastings61272b72014-01-07 12:41:53 -080064/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090065class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080066[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090067/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
69/*[python input]
70class Py_UCS4_converter(CConverter):
71 type = 'Py_UCS4'
72 converter = 'convert_uc'
73
74 def converter_init(self):
75 if self.default is not unspecified:
76 self.c_default = ascii(self.default)
77 if len(self.c_default) > 4 or self.c_default[0] != "'":
78 self.c_default = hex(ord(self.default))
79
80[python start generated code]*/
81/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchaka05997252013-01-26 12:14:02 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Highest code point a str may hold: 0x10ffff (1,114,111), the Unicode 6.0
   maximum. */
#define MAX_UNICODE 0x10ffff

/* Validation used inside the assert()s of the accessor macros below.
   Debug builds run the consistency checker (without the content scan);
   other builds only perform the type check. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200104
/* Unchecked access to the `utf8` member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* Checked access to the UTF-8 bytes of a ready string.  For a compact ASCII
   object the result is the address immediately following the PyASCIIObject
   header; for every other object it is the `utf8` member. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     !PyUnicode_IS_COMPACT_ASCII(op) \
         ? _PyUnicode_UTF8(op) \
         : ((char *)((PyASCIIObject *)(op) + 1)))

/* Unchecked access to the `utf8_length` member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Checked UTF-8 length of a ready string: the `length` member for compact
   ASCII objects, the `utf8_length` member otherwise. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     !PyUnicode_IS_COMPACT_ASCII(op) \
         ? _PyUnicode_UTF8_LENGTH(op) \
         : ((PyASCIIObject *)(op))->length)

/* Unchecked access to the `wstr` member of a PyASCIIObject. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900123
/* Don't use the deprecated macro of unicodeobject.h: replace it with a local
   definition.  Yields the `length` member for compact ASCII objects and the
   `wstr_length` member of the compact object otherwise.

   The argument is fully parenthesized inside each cast so that a compound
   argument expression (e.g. pointer arithmetic) is evaluated first and only
   then converted to the object pointer type. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject *)(op))->length : \
     ((PyCompactUnicodeObject *)(op))->wstr_length)
/* Unchecked member accessors.  Each one expands to a plain member
   expression on the object cast to the struct that declares the member. */
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)

/* Checked readers: assert the object passes _PyUnicode_CHECK() and then
   read the member. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Local replacement for PyUnicode_READY(): evaluates to 0 when the object
   is already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_READY(op) ? \
      _PyUnicode_Ready(op) : \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200153
/* True if the UTF-8 pointer of `op` is the same address as its character
   data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same address as its character
   data.  The comparison reads the wstr pointer of the macro argument itself
   rather than of an unrelated `unicode` identifier in the caller's scope. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
161
/* Non-zero when the object owns a separately allocated UTF-8 block: it is
   not compact ASCII, its utf8 pointer is set, and that pointer is not the
   character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && !(_PyUnicode_UTF8(op) == PyUnicode_DATA(op))))

/* Non-zero when the object owns a separately allocated wstr block: its wstr
   pointer is set and the object is either not ready or its wstr pointer is
   not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) && \
      !(PyUnicode_IS_READY(op) && \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
175
/* Copy a run of characters, converting each one from `from_type` to
   `to_type` with a plain cast.

   from_type, to_type: valid type names.
   begin, end:         pointers delimiting the source characters; they are
                       converted to "const from_type *".
   to:                 pointer converted to "to_type *"; destination buffer
                       that receives (end - begin) converted characters.

   The bulk of the range is handled four characters per iteration; the
   remaining 0-3 characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        to_type *_to = (to_type *)(to); \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _count = (_end) - (_iter); \
        const from_type *_unrolled_end = \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4); \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0]; \
            _to[1] = (to_type) _iter[1]; \
            _to[2] = (to_type) _iter[2]; \
            _to[3] = (to_type) _iter[3]; \
        } \
        for (; _iter < (_end); ++_iter, ++_to) { \
            *_to = (to_type) *_iter; \
        } \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200199
/* Over-allocation factor, chosen per platform:
     - Windows: 2 (overallocating by 50% is the best factor there);
     - elsewhere (measured on Linux): 4 (overallocating by 25% is best). */
#if defined(MS_WINDOWS)
#  define OVERALLOCATE_FACTOR 2
#else
#  define OVERALLOCATE_FACTOR 4
#endif
207
Victor Stinner607b1022020-05-05 18:50:30 +0200208/* bpo-40521: Interned strings are shared by all interpreters. */
209#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
210# define INTERNED_STRINGS
211#endif
212
Walter Dörwald16807132007-05-25 13:52:07 +0000213/* This dictionary holds all interned unicode strings. Note that references
214 to strings in this dictionary are *not* counted in the string's ob_refcnt.
215 When the interned string reaches a refcnt of 0 the string deallocation
216 function will delete the reference from this dictionary.
217
218 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000219 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000220*/
Victor Stinner607b1022020-05-05 18:50:30 +0200221#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200223#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000224
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200225static struct _Py_unicode_state*
226get_unicode_state(void)
227{
228 PyInterpreterState *interp = _PyInterpreterState_GET();
229 return &interp->unicode;
230}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200231
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000232
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200234static inline PyObject* unicode_get_empty(void)
235{
236 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200237 // unicode_get_empty() must not be called before _PyUnicode_Init()
238 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200239 assert(state->empty_string != NULL);
240 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200241}
242
Victor Stinner91698d82020-06-25 14:07:40 +0200243
244// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200245static inline PyObject* unicode_new_empty(void)
246{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200247 PyObject *empty = unicode_get_empty();
248 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200249 return empty;
250}
251
252#define _Py_RETURN_UNICODE_EMPTY() \
253 do { \
254 return unicode_new_empty(); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200255 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256
Victor Stinner59423e32018-11-26 13:40:01 +0100257static inline void
258unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
259 Py_ssize_t start, Py_ssize_t length)
260{
261 assert(0 <= start);
262 assert(kind != PyUnicode_WCHAR_KIND);
263 switch (kind) {
264 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100265 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100266 Py_UCS1 ch = (unsigned char)value;
267 Py_UCS1 *to = (Py_UCS1 *)data + start;
268 memset(to, ch, length);
269 break;
270 }
271 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100272 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100273 Py_UCS2 ch = (Py_UCS2)value;
274 Py_UCS2 *to = (Py_UCS2 *)data + start;
275 const Py_UCS2 *end = to + length;
276 for (; to < end; ++to) *to = ch;
277 break;
278 }
279 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100280 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100281 Py_UCS4 ch = value;
282 Py_UCS4 * to = (Py_UCS4 *)data + start;
283 const Py_UCS4 *end = to + length;
284 for (; to < end; ++to) *to = ch;
285 break;
286 }
287 default: Py_UNREACHABLE();
288 }
289}
290
291
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200292/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700293static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200294_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900295static inline void
296_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400297static PyObject *
298unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
299 const char *errors);
300static PyObject *
301unicode_decode_utf8(const char *s, Py_ssize_t size,
302 _Py_error_handler error_handler, const char *errors,
303 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200304
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200305/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200306static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200307
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code point, holding 1 for whitespace and 0 for
   everything else (entries not listed below are zero-initialized). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
338
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200339/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200340static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200341static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100342static int unicode_modifiable(PyObject *unicode);
343
Victor Stinnerfe226c02011-10-03 03:52:20 +0200344
Alexander Belopolsky40018472011-02-26 01:02:56 +0000345static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100346_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200347static PyObject *
348_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
349static PyObject *
350_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
351
352static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000353unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000354 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100355 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000356 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
357
Alexander Belopolsky40018472011-02-26 01:02:56 +0000358static void
359raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300360 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100361 PyObject *unicode,
362 Py_ssize_t startpos, Py_ssize_t endpos,
363 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000364
/* Same idea for line breaks: a 128-entry table indexed by ASCII code point,
   holding 1 for line-break characters and 0 for everything else (entries
   not listed below are zero-initialized). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
392
INADA Naoki3ae20562017-01-16 20:41:20 +0900393static int convert_uc(PyObject *obj, void *addr);
394
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300395#include "clinic/unicodeobject.c.h"
396
Victor Stinner3d4226a2018-08-29 22:21:32 +0200397_Py_error_handler
398_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200399{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200400 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200401 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200402 }
403 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200404 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200405 }
406 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200407 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200408 }
409 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200410 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200411 }
412 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200413 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200414 }
415 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200416 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200417 }
418 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200419 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200420 }
Victor Stinner50149202015-09-22 00:26:54 +0200421 return _Py_ERROR_OTHER;
422}
423
Victor Stinner709d23d2019-05-02 14:56:30 -0400424
425static _Py_error_handler
426get_error_handler_wide(const wchar_t *errors)
427{
428 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
429 return _Py_ERROR_STRICT;
430 }
431 if (wcscmp(errors, L"surrogateescape") == 0) {
432 return _Py_ERROR_SURROGATEESCAPE;
433 }
434 if (wcscmp(errors, L"replace") == 0) {
435 return _Py_ERROR_REPLACE;
436 }
437 if (wcscmp(errors, L"ignore") == 0) {
438 return _Py_ERROR_IGNORE;
439 }
440 if (wcscmp(errors, L"backslashreplace") == 0) {
441 return _Py_ERROR_BACKSLASHREPLACE;
442 }
443 if (wcscmp(errors, L"surrogatepass") == 0) {
444 return _Py_ERROR_SURROGATEPASS;
445 }
446 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
447 return _Py_ERROR_XMLCHARREFREPLACE;
448 }
449 return _Py_ERROR_OTHER;
450}
451
452
Victor Stinner22eb6892019-06-26 00:51:05 +0200453static inline int
454unicode_check_encoding_errors(const char *encoding, const char *errors)
455{
456 if (encoding == NULL && errors == NULL) {
457 return 0;
458 }
459
Victor Stinner81a7be32020-04-14 15:14:01 +0200460 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200461#ifndef Py_DEBUG
462 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200463 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200464 return 0;
465 }
466#else
467 /* Always check in debug mode */
468#endif
469
470 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
471 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200472 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200473 return 0;
474 }
475
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200476 /* Disable checks during Python finalization. For example, it allows to
477 call _PyObject_Dump() during finalization for debugging purpose. */
478 if (interp->finalizing) {
479 return 0;
480 }
481
Victor Stinner22eb6892019-06-26 00:51:05 +0200482 if (encoding != NULL) {
483 PyObject *handler = _PyCodec_Lookup(encoding);
484 if (handler == NULL) {
485 return -1;
486 }
487 Py_DECREF(handler);
488 }
489
490 if (errors != NULL) {
491 PyObject *handler = PyCodec_LookupError(errors);
492 if (handler == NULL) {
493 return -1;
494 }
495 Py_DECREF(handler);
496 }
497 return 0;
498}
499
500
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200501int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100502_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200503{
Victor Stinner68762572019-10-07 18:42:01 +0200504#define CHECK(expr) \
505 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
506
Victor Stinner910337b2011-10-03 03:20:16 +0200507 PyASCIIObject *ascii;
508 unsigned int kind;
509
Victor Stinner68762572019-10-07 18:42:01 +0200510 assert(op != NULL);
511 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200512
513 ascii = (PyASCIIObject *)op;
514 kind = ascii->state.kind;
515
Victor Stinnera3b334d2011-10-03 13:53:37 +0200516 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200517 CHECK(kind == PyUnicode_1BYTE_KIND);
518 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200519 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200520 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200521 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200522 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200523
Victor Stinnera41463c2011-10-04 01:05:08 +0200524 if (ascii->state.compact == 1) {
525 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200526 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200527 || kind == PyUnicode_2BYTE_KIND
528 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200529 CHECK(ascii->state.ascii == 0);
530 CHECK(ascii->state.ready == 1);
531 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100532 }
533 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200534 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
535
536 data = unicode->data.any;
537 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(ascii->length == 0);
539 CHECK(ascii->hash == -1);
540 CHECK(ascii->state.compact == 0);
541 CHECK(ascii->state.ascii == 0);
542 CHECK(ascii->state.ready == 0);
543 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
544 CHECK(ascii->wstr != NULL);
545 CHECK(data == NULL);
546 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200547 }
548 else {
Victor Stinner68762572019-10-07 18:42:01 +0200549 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200550 || kind == PyUnicode_2BYTE_KIND
551 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200552 CHECK(ascii->state.compact == 0);
553 CHECK(ascii->state.ready == 1);
554 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200555 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200556 CHECK(compact->utf8 == data);
557 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200558 }
559 else
Victor Stinner68762572019-10-07 18:42:01 +0200560 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200561 }
562 }
563 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200564 if (
565#if SIZEOF_WCHAR_T == 2
566 kind == PyUnicode_2BYTE_KIND
567#else
568 kind == PyUnicode_4BYTE_KIND
569#endif
570 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200571 {
Victor Stinner68762572019-10-07 18:42:01 +0200572 CHECK(ascii->wstr == data);
573 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200574 } else
Victor Stinner68762572019-10-07 18:42:01 +0200575 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200576 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200577
578 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200579 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200580 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200581 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200582 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200583
584 /* check that the best kind is used: O(n) operation */
585 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200586 Py_ssize_t i;
587 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300588 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200589 Py_UCS4 ch;
590
591 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200592 for (i=0; i < ascii->length; i++)
593 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200594 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200595 if (ch > maxchar)
596 maxchar = ch;
597 }
598 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100599 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(maxchar >= 128);
601 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100602 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200603 else
Victor Stinner68762572019-10-07 18:42:01 +0200604 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200605 }
Victor Stinner77faf692011-11-20 18:56:05 +0100606 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200607 CHECK(maxchar >= 0x100);
608 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100609 }
610 else {
Victor Stinner68762572019-10-07 18:42:01 +0200611 CHECK(maxchar >= 0x10000);
612 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100613 }
Victor Stinner68762572019-10-07 18:42:01 +0200614 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200615 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400616 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200617
618#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400619}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200620
Victor Stinner910337b2011-10-03 03:20:16 +0200621
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100622static PyObject*
623unicode_result_wchar(PyObject *unicode)
624{
625#ifndef Py_DEBUG
626 Py_ssize_t len;
627
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100628 len = _PyUnicode_WSTR_LENGTH(unicode);
629 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200631 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100632 }
633
634 if (len == 1) {
635 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100636 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100637 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200638 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100639 }
640 }
641
642 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200643 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100644 return NULL;
645 }
646#else
Victor Stinneraa771272012-10-04 02:32:58 +0200647 assert(Py_REFCNT(unicode) == 1);
648
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100649 /* don't make the result ready in debug mode to ensure that the caller
650 makes the string ready before using it */
651 assert(_PyUnicode_CheckConsistency(unicode, 1));
652#endif
653 return unicode;
654}
655
656static PyObject*
657unicode_result_ready(PyObject *unicode)
658{
659 Py_ssize_t length;
660
661 length = PyUnicode_GET_LENGTH(unicode);
662 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200663 PyObject *empty = unicode_get_empty();
664 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100665 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200666 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100667 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200668 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100669 }
670
671 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200672 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200673 if (kind == PyUnicode_1BYTE_KIND) {
674 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
675 Py_UCS1 ch = data[0];
676 struct _Py_unicode_state *state = get_unicode_state();
677 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100678 if (latin1_char != NULL) {
679 if (unicode != latin1_char) {
680 Py_INCREF(latin1_char);
681 Py_DECREF(unicode);
682 }
683 return latin1_char;
684 }
685 else {
686 assert(_PyUnicode_CheckConsistency(unicode, 1));
687 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200688 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100689 return unicode;
690 }
691 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200692 else {
693 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
694 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100695 }
696
697 assert(_PyUnicode_CheckConsistency(unicode, 1));
698 return unicode;
699}
700
701static PyObject*
702unicode_result(PyObject *unicode)
703{
704 assert(_PyUnicode_CHECK(unicode));
705 if (PyUnicode_IS_READY(unicode))
706 return unicode_result_ready(unicode);
707 else
708 return unicode_result_wchar(unicode);
709}
710
Victor Stinnerc4b49542011-12-11 22:44:26 +0100711static PyObject*
712unicode_result_unchanged(PyObject *unicode)
713{
714 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500715 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100716 return NULL;
717 Py_INCREF(unicode);
718 return unicode;
719 }
720 else
721 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100722 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100723}
724
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200725/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
726 ASCII, Latin1, UTF-8, etc. */
727static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200728backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200729 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
730{
Victor Stinnerad771582015-10-09 12:38:53 +0200731 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200732 Py_UCS4 ch;
733 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300734 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735
736 assert(PyUnicode_IS_READY(unicode));
737 kind = PyUnicode_KIND(unicode);
738 data = PyUnicode_DATA(unicode);
739
740 size = 0;
741 /* determine replacement size */
742 for (i = collstart; i < collend; ++i) {
743 Py_ssize_t incr;
744
745 ch = PyUnicode_READ(kind, data, i);
746 if (ch < 0x100)
747 incr = 2+2;
748 else if (ch < 0x10000)
749 incr = 2+4;
750 else {
751 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200752 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200753 }
754 if (size > PY_SSIZE_T_MAX - incr) {
755 PyErr_SetString(PyExc_OverflowError,
756 "encoded result is too long for a Python string");
757 return NULL;
758 }
759 size += incr;
760 }
761
Victor Stinnerad771582015-10-09 12:38:53 +0200762 str = _PyBytesWriter_Prepare(writer, str, size);
763 if (str == NULL)
764 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200765
766 /* generate replacement */
767 for (i = collstart; i < collend; ++i) {
768 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200769 *str++ = '\\';
770 if (ch >= 0x00010000) {
771 *str++ = 'U';
772 *str++ = Py_hexdigits[(ch>>28)&0xf];
773 *str++ = Py_hexdigits[(ch>>24)&0xf];
774 *str++ = Py_hexdigits[(ch>>20)&0xf];
775 *str++ = Py_hexdigits[(ch>>16)&0xf];
776 *str++ = Py_hexdigits[(ch>>12)&0xf];
777 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 }
Victor Stinner797485e2015-10-09 03:17:30 +0200779 else if (ch >= 0x100) {
780 *str++ = 'u';
781 *str++ = Py_hexdigits[(ch>>12)&0xf];
782 *str++ = Py_hexdigits[(ch>>8)&0xf];
783 }
784 else
785 *str++ = 'x';
786 *str++ = Py_hexdigits[(ch>>4)&0xf];
787 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200788 }
789 return str;
790}
791
792/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
793 ASCII, Latin1, UTF-8, etc. */
794static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200795xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200796 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
797{
Victor Stinnerad771582015-10-09 12:38:53 +0200798 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200799 Py_UCS4 ch;
800 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300801 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200802
803 assert(PyUnicode_IS_READY(unicode));
804 kind = PyUnicode_KIND(unicode);
805 data = PyUnicode_DATA(unicode);
806
807 size = 0;
808 /* determine replacement size */
809 for (i = collstart; i < collend; ++i) {
810 Py_ssize_t incr;
811
812 ch = PyUnicode_READ(kind, data, i);
813 if (ch < 10)
814 incr = 2+1+1;
815 else if (ch < 100)
816 incr = 2+2+1;
817 else if (ch < 1000)
818 incr = 2+3+1;
819 else if (ch < 10000)
820 incr = 2+4+1;
821 else if (ch < 100000)
822 incr = 2+5+1;
823 else if (ch < 1000000)
824 incr = 2+6+1;
825 else {
826 assert(ch <= MAX_UNICODE);
827 incr = 2+7+1;
828 }
829 if (size > PY_SSIZE_T_MAX - incr) {
830 PyErr_SetString(PyExc_OverflowError,
831 "encoded result is too long for a Python string");
832 return NULL;
833 }
834 size += incr;
835 }
836
Victor Stinnerad771582015-10-09 12:38:53 +0200837 str = _PyBytesWriter_Prepare(writer, str, size);
838 if (str == NULL)
839 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200840
841 /* generate replacement */
842 for (i = collstart; i < collend; ++i) {
Christian Heimes07f2ade2020-11-18 16:38:53 +0100843 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
844 if (size < 0) {
845 return NULL;
846 }
847 str += size;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200848 }
849 return str;
850}
851
Thomas Wouters477c8d52006-05-27 19:21:47 +0000852/* --- Bloom Filters ----------------------------------------------------- */
853
854/* stuff to implement simple "bloom filters" for Unicode characters.
855 to keep things simple, we use a single bitmask, using the least 5
856 bits from each unicode characters as the bit index. */
857
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200858/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000859
Antoine Pitrouf068f942010-01-13 14:19:12 +0000860#if LONG_BIT >= 128
861#define BLOOM_WIDTH 128
862#elif LONG_BIT >= 64
863#define BLOOM_WIDTH 64
864#elif LONG_BIT >= 32
865#define BLOOM_WIDTH 32
866#else
867#error "LONG_BIT is smaller than 32"
868#endif
869
Thomas Wouters477c8d52006-05-27 19:21:47 +0000870#define BLOOM_MASK unsigned long
871
Serhiy Storchaka05997252013-01-26 12:14:02 +0200872static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873
Antoine Pitrouf068f942010-01-13 14:19:12 +0000874#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000875
Benjamin Peterson29060642009-01-31 22:14:21 +0000876#define BLOOM_LINEBREAK(ch) \
877 ((ch) < 128U ? ascii_linebreak[(ch)] : \
878 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000879
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700880static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300881make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882{
Victor Stinnera85af502013-04-09 21:53:54 +0200883#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
884 do { \
885 TYPE *data = (TYPE *)PTR; \
886 TYPE *end = data + LEN; \
887 Py_UCS4 ch; \
888 for (; data != end; data++) { \
889 ch = *data; \
890 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
891 } \
892 break; \
893 } while (0)
894
Thomas Wouters477c8d52006-05-27 19:21:47 +0000895 /* calculate simple bloom-style bitmask for a given unicode string */
896
Antoine Pitrouf068f942010-01-13 14:19:12 +0000897 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000898
899 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200900 switch (kind) {
901 case PyUnicode_1BYTE_KIND:
902 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
903 break;
904 case PyUnicode_2BYTE_KIND:
905 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
906 break;
907 case PyUnicode_4BYTE_KIND:
908 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
909 break;
910 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700911 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200912 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000913 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200914
915#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000916}
917
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300918static int
919ensure_unicode(PyObject *obj)
920{
921 if (!PyUnicode_Check(obj)) {
922 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200923 "must be str, not %.100s",
924 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300925 return -1;
926 }
927 return PyUnicode_READY(obj);
928}
929
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200930/* Compilation of templated routines */
931
Victor Stinner90ed8a62020-06-24 00:34:07 +0200932#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200933
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200934#include "stringlib/asciilib.h"
935#include "stringlib/fastsearch.h"
936#include "stringlib/partition.h"
937#include "stringlib/split.h"
938#include "stringlib/count.h"
939#include "stringlib/find.h"
940#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200941#include "stringlib/undef.h"
942
943#include "stringlib/ucs1lib.h"
944#include "stringlib/fastsearch.h"
945#include "stringlib/partition.h"
946#include "stringlib/split.h"
947#include "stringlib/count.h"
948#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300949#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200950#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200951#include "stringlib/undef.h"
952
953#include "stringlib/ucs2lib.h"
954#include "stringlib/fastsearch.h"
955#include "stringlib/partition.h"
956#include "stringlib/split.h"
957#include "stringlib/count.h"
958#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300959#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200960#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200961#include "stringlib/undef.h"
962
963#include "stringlib/ucs4lib.h"
964#include "stringlib/fastsearch.h"
965#include "stringlib/partition.h"
966#include "stringlib/split.h"
967#include "stringlib/count.h"
968#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300969#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200970#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200971#include "stringlib/undef.h"
972
Inada Naoki2c4928d2020-06-17 20:09:44 +0900973_Py_COMP_DIAG_PUSH
974_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200975#include "stringlib/unicodedefs.h"
976#include "stringlib/fastsearch.h"
977#include "stringlib/count.h"
978#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100979#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900980_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200981
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200982#undef STRINGLIB_GET_EMPTY
983
Guido van Rossumd57fd912000-03-10 22:53:23 +0000984/* --- Unicode Object ----------------------------------------------------- */
985
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700986static inline Py_ssize_t
987findchar(const void *s, int kind,
988 Py_ssize_t size, Py_UCS4 ch,
989 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200991 switch (kind) {
992 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200993 if ((Py_UCS1) ch != ch)
994 return -1;
995 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600996 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200997 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200999 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001000 if ((Py_UCS2) ch != ch)
1001 return -1;
1002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001007 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001008 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001009 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001010 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001011 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001012 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001014}
1015
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, current length) are overwritten, each
   byte being set to 0xff; nothing happens if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value doubles as the size of one character in bytes */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1034
/* Resize a compact, ready, modifiable string to `length` characters by
   reallocating the object itself.

   Returns the resized object -- possibly at a different address, so the
   caller must use the returned pointer -- or NULL with MemoryError set.
   A cached UTF-8 buffer owned by the string is freed first, a separately
   allocated wstr buffer is freed afterwards, and a terminating null
   character is written at index `length`. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value is used as the size in bytes of one character */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* remember the sharing state now: the data pointer may change below */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* overflow check for struct_size + (length + 1) * char_size */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* NOTE(review): the reference bookkeeping is undone here and redone with
       _Py_NewReference() after the realloc (on both the success and the
       failure path), presumably because the object may move. */
#ifdef Py_REF_DEBUG
    _Py_RefTotal--;
#endif
#ifdef Py_TRACE_REFS
    _Py_ForgetReference(unicode);
#endif

    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
    if (new_unicode == NULL) {
        /* the original object is still valid: register it again */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr aliases the character data: point it at the new location */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* a separate wstr buffer no longer matches the new length: drop it */
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* terminating null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1105
/* Resize a non-compact string with a reference count of 1 to `length`
   characters, keeping the object itself at the same address.

   For a ready string the character buffer is reallocated; wstr and utf8
   pointers that alias it are updated, a UTF-8 buffer that does not alias it
   is freed.  A wstr buffer that is not shared with the character data (or
   the wstr buffer of a not-ready string) is reallocated separately.
   Returns 0 on success, -1 with MemoryError set on failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* the kind value is used as the size in bytes of one character */
        char_size = PyUnicode_KIND(unicode);
        /* remember the sharing state before the data pointer changes */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* overflow check for (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_Free(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        data = (PyObject *)PyObject_Realloc(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* done unless a separate wstr buffer still has to be resized */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_Realloc(wstr, new_size);
    if (!wstr) {
        /* the old buffer is still referenced by the object */
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1184
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185static PyObject*
1186resize_copy(PyObject *unicode, Py_ssize_t length)
1187{
1188 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001189 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001190 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001191
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001192 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001193
1194 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1195 if (copy == NULL)
1196 return NULL;
1197
1198 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001199 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001200 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001201 }
1202 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001203 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001204
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001205 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001206 if (w == NULL)
1207 return NULL;
1208 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1209 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001210 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001211 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001212 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001213 }
1214}
1215
/* We allocate one more element (not just one byte) than `length` to make
   sure the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/

/* Create a new not-ready string object whose wstr buffer has room for
   `length` characters plus the terminator.

   length == 0 returns the result of unicode_new_empty().  Returns NULL with
   MemoryError set if the size would overflow or an allocation fails, and
   NULL with SystemError set if `length` is negative.  Only the first element
   and the terminator of the buffer are initialized. */
static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)
{
    PyUnicodeObject *unicode;
    size_t new_size;

    /* Optimization for empty strings */
    if (length == 0) {
        return (PyUnicodeObject *)unicode_new_empty();
    }

    /* Ensure we won't overflow the size. */
    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
        return (PyUnicodeObject *)PyErr_NoMemory();
    }
    if (length < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to _PyUnicode_New");
        return NULL;
    }

    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
    if (unicode == NULL)
        return NULL;
    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);

    /* Put every field into a defined state before the buffer allocation, so
       that the Py_DECREF() on the failure path below sees a fully
       initialized object. */
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = 0;
    _PyUnicode_STATE(unicode).compact = 0;
    _PyUnicode_STATE(unicode).ready = 0;
    _PyUnicode_STATE(unicode).ascii = 0;
    _PyUnicode_DATA_ANY(unicode) = NULL;
    _PyUnicode_LENGTH(unicode) = 0;
    _PyUnicode_UTF8(unicode) = NULL;
    _PyUnicode_UTF8_LENGTH(unicode) = 0;

    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
    if (!_PyUnicode_WSTR(unicode)) {
        Py_DECREF(unicode);
        PyErr_NoMemory();
        return NULL;
    }

    /* Initialize the first element to guard against cases where
     * the caller fails before initializing str -- unicode_resize()
     * reads str[0], and the Keep-Alive optimization can keep memory
     * allocated for str alive across a call to unicode_dealloc(unicode).
     * We don't want unicode_resize to read uninitialized memory in
     * that case.
     */
    _PyUnicode_WSTR(unicode)[0] = 0;
    _PyUnicode_WSTR(unicode)[length] = 0;

    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
    return unicode;
}
1283
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284static const char*
1285unicode_kind_name(PyObject *unicode)
1286{
Victor Stinner42dfd712011-10-03 14:41:45 +02001287 /* don't check consistency: unicode_kind_name() is called from
1288 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 if (!PyUnicode_IS_COMPACT(unicode))
1290 {
1291 if (!PyUnicode_IS_READY(unicode))
1292 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001293 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001294 {
1295 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001296 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001297 return "legacy ascii";
1298 else
1299 return "legacy latin1";
1300 case PyUnicode_2BYTE_KIND:
1301 return "legacy UCS2";
1302 case PyUnicode_4BYTE_KIND:
1303 return "legacy UCS4";
1304 default:
1305 return "<legacy invalid kind>";
1306 }
1307 }
1308 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001309 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001310 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001311 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001312 return "ascii";
1313 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001314 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001315 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001316 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001317 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001318 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001319 default:
1320 return "<invalid compact kind>";
1321 }
1322}
1323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro.
   `unicode_raw` is cast to PyObject* without any type check. */
const char *_PyUnicode_utf8(void *unicode_raw){
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    return PyUnicode_UTF8(unicode);
}
1330
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro.
   `unicode_raw` is cast to PyObject* without any type check. */
const void *_PyUnicode_compact_data(void *unicode_raw) {
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    return _PyUnicode_COMPACT_DATA(unicode);
}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001335const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001336 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001337 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1339 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1340 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1341 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1342 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1343 return PyUnicode_DATA(unicode);
1344}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001345
1346void
1347_PyUnicode_Dump(PyObject *op)
1348{
1349 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001350 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1351 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001352 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001353
Victor Stinnera849a4b2011-10-03 12:12:11 +02001354 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001355 {
1356 if (ascii->state.ascii)
1357 data = (ascii + 1);
1358 else
1359 data = (compact + 1);
1360 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001361 else
1362 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001363 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001364
Victor Stinnera849a4b2011-10-03 12:12:11 +02001365 if (ascii->wstr == data)
1366 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001367 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001368
Victor Stinnera3b334d2011-10-03 13:53:37 +02001369 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001370 printf(" (%zu), ", compact->wstr_length);
1371 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001372 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001373 }
1374 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001375 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001376 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001377}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378#endif
1379
Victor Stinner91698d82020-06-25 14:07:40 +02001380static int
1381unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1382{
1383 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1384 // optimized to always use state->empty_string without having to check if
1385 // it is NULL or not.
1386 PyObject *empty = PyUnicode_New(1, 0);
1387 if (empty == NULL) {
1388 return -1;
1389 }
1390 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1391 _PyUnicode_LENGTH(empty) = 0;
1392 assert(_PyUnicode_CheckConsistency(empty, 1));
1393
1394 assert(state->empty_string == NULL);
1395 state->empty_string = empty;
1396 return 0;
1397}
1398
1399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400PyObject *
1401PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1402{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001403 /* Optimization for empty strings */
1404 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001405 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001406 }
1407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 PyObject *obj;
1409 PyCompactUnicodeObject *unicode;
1410 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001411 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001412 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 Py_ssize_t char_size;
1414 Py_ssize_t struct_size;
1415
Victor Stinner9e9d6892011-10-04 01:02:02 +02001416 is_ascii = 0;
1417 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 struct_size = sizeof(PyCompactUnicodeObject);
1419 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001420 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 char_size = 1;
1422 is_ascii = 1;
1423 struct_size = sizeof(PyASCIIObject);
1424 }
1425 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001426 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 char_size = 1;
1428 }
1429 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001430 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001431 char_size = 2;
1432 if (sizeof(wchar_t) == 2)
1433 is_sharing = 1;
1434 }
1435 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001436 if (maxchar > MAX_UNICODE) {
1437 PyErr_SetString(PyExc_SystemError,
1438 "invalid maximum character passed to PyUnicode_New");
1439 return NULL;
1440 }
Victor Stinner8f825062012-04-27 13:55:39 +02001441 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 char_size = 4;
1443 if (sizeof(wchar_t) == 4)
1444 is_sharing = 1;
1445 }
1446
1447 /* Ensure we won't overflow the size. */
1448 if (size < 0) {
1449 PyErr_SetString(PyExc_SystemError,
1450 "Negative size passed to PyUnicode_New");
1451 return NULL;
1452 }
1453 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1454 return PyErr_NoMemory();
1455
1456 /* Duplicated allocation code from _PyObject_New() instead of a call to
1457 * PyObject_New() so we are able to allocate space for the object and
1458 * it's data buffer.
1459 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001460 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001461 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001463 }
1464 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465
1466 unicode = (PyCompactUnicodeObject *)obj;
1467 if (is_ascii)
1468 data = ((PyASCIIObject*)obj) + 1;
1469 else
1470 data = unicode + 1;
1471 _PyUnicode_LENGTH(unicode) = size;
1472 _PyUnicode_HASH(unicode) = -1;
1473 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001474 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 _PyUnicode_STATE(unicode).compact = 1;
1476 _PyUnicode_STATE(unicode).ready = 1;
1477 _PyUnicode_STATE(unicode).ascii = is_ascii;
1478 if (is_ascii) {
1479 ((char*)data)[size] = 0;
1480 _PyUnicode_WSTR(unicode) = NULL;
1481 }
Victor Stinner8f825062012-04-27 13:55:39 +02001482 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 ((char*)data)[size] = 0;
1484 _PyUnicode_WSTR(unicode) = NULL;
1485 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001487 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 else {
1490 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001491 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001492 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001494 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 ((Py_UCS4*)data)[size] = 0;
1496 if (is_sharing) {
1497 _PyUnicode_WSTR_LENGTH(unicode) = size;
1498 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1499 }
1500 else {
1501 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1502 _PyUnicode_WSTR(unicode) = NULL;
1503 }
1504 }
Victor Stinner8f825062012-04-27 13:55:39 +02001505#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001506 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001507#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001508 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001509 return obj;
1510}
1511
1512#if SIZEOF_WCHAR_T == 2
1513/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1514 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001515 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516
1517 This function assumes that unicode can hold one more code point than wstr
1518 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001519static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001521 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522{
1523 const wchar_t *iter;
1524 Py_UCS4 *ucs4_out;
1525
Victor Stinner910337b2011-10-03 03:20:16 +02001526 assert(unicode != NULL);
1527 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1529 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1530
1531 for (iter = begin; iter < end; ) {
1532 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1533 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001534 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1535 && (iter+1) < end
1536 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 {
Victor Stinner551ac952011-11-29 22:58:13 +01001538 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539 iter += 2;
1540 }
1541 else {
1542 *ucs4_out++ = *iter;
1543 iter++;
1544 }
1545 }
1546 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1547 _PyUnicode_GET_LENGTH(unicode)));
1548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549}
1550#endif
1551
Victor Stinnercd9950f2011-10-02 00:34:53 +02001552static int
Victor Stinner488fa492011-12-12 00:01:39 +01001553unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001554{
Victor Stinner488fa492011-12-12 00:01:39 +01001555 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001556 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001557 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001558 return -1;
1559 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001560 return 0;
1561}
1562
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563static int
1564_copy_characters(PyObject *to, Py_ssize_t to_start,
1565 PyObject *from, Py_ssize_t from_start,
1566 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001568 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001569 const void *from_data;
1570 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001571
Victor Stinneree4544c2012-05-09 22:24:08 +02001572 assert(0 <= how_many);
1573 assert(0 <= from_start);
1574 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001575 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001576 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001577 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578
Victor Stinnerd3f08822012-05-29 12:57:52 +02001579 assert(PyUnicode_Check(to));
1580 assert(PyUnicode_IS_READY(to));
1581 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1582
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001583 if (how_many == 0)
1584 return 0;
1585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001587 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001589 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590
Victor Stinnerf1852262012-06-16 16:38:26 +02001591#ifdef Py_DEBUG
1592 if (!check_maxchar
1593 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1594 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001595 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001596 Py_UCS4 ch;
1597 Py_ssize_t i;
1598 for (i=0; i < how_many; i++) {
1599 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1600 assert(ch <= to_maxchar);
1601 }
1602 }
1603#endif
1604
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001605 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001606 if (check_maxchar
1607 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1608 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001609 /* Writing Latin-1 characters into an ASCII string requires to
1610 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001611 Py_UCS4 max_char;
1612 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001613 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001614 if (max_char >= 128)
1615 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001616 }
Christian Heimesf051e432016-09-13 20:22:02 +02001617 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001618 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001619 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001621 else if (from_kind == PyUnicode_1BYTE_KIND
1622 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001623 {
1624 _PyUnicode_CONVERT_BYTES(
1625 Py_UCS1, Py_UCS2,
1626 PyUnicode_1BYTE_DATA(from) + from_start,
1627 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1628 PyUnicode_2BYTE_DATA(to) + to_start
1629 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001630 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001631 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001632 && to_kind == PyUnicode_4BYTE_KIND)
1633 {
1634 _PyUnicode_CONVERT_BYTES(
1635 Py_UCS1, Py_UCS4,
1636 PyUnicode_1BYTE_DATA(from) + from_start,
1637 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1638 PyUnicode_4BYTE_DATA(to) + to_start
1639 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001640 }
1641 else if (from_kind == PyUnicode_2BYTE_KIND
1642 && to_kind == PyUnicode_4BYTE_KIND)
1643 {
1644 _PyUnicode_CONVERT_BYTES(
1645 Py_UCS2, Py_UCS4,
1646 PyUnicode_2BYTE_DATA(from) + from_start,
1647 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1648 PyUnicode_4BYTE_DATA(to) + to_start
1649 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001650 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001651 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001652 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1653
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001654 if (!check_maxchar) {
1655 if (from_kind == PyUnicode_2BYTE_KIND
1656 && to_kind == PyUnicode_1BYTE_KIND)
1657 {
1658 _PyUnicode_CONVERT_BYTES(
1659 Py_UCS2, Py_UCS1,
1660 PyUnicode_2BYTE_DATA(from) + from_start,
1661 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1662 PyUnicode_1BYTE_DATA(to) + to_start
1663 );
1664 }
1665 else if (from_kind == PyUnicode_4BYTE_KIND
1666 && to_kind == PyUnicode_1BYTE_KIND)
1667 {
1668 _PyUnicode_CONVERT_BYTES(
1669 Py_UCS4, Py_UCS1,
1670 PyUnicode_4BYTE_DATA(from) + from_start,
1671 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1672 PyUnicode_1BYTE_DATA(to) + to_start
1673 );
1674 }
1675 else if (from_kind == PyUnicode_4BYTE_KIND
1676 && to_kind == PyUnicode_2BYTE_KIND)
1677 {
1678 _PyUnicode_CONVERT_BYTES(
1679 Py_UCS4, Py_UCS2,
1680 PyUnicode_4BYTE_DATA(from) + from_start,
1681 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1682 PyUnicode_2BYTE_DATA(to) + to_start
1683 );
1684 }
1685 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001686 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001687 }
1688 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001689 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001690 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001691 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001692 Py_ssize_t i;
1693
Victor Stinnera0702ab2011-09-29 14:14:38 +02001694 for (i=0; i < how_many; i++) {
1695 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001696 if (ch > to_maxchar)
1697 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001698 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1699 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001700 }
1701 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001702 return 0;
1703}
1704
Victor Stinnerd3f08822012-05-29 12:57:52 +02001705void
1706_PyUnicode_FastCopyCharacters(
1707 PyObject *to, Py_ssize_t to_start,
1708 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001709{
1710 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1711}
1712
1713Py_ssize_t
1714PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1715 PyObject *from, Py_ssize_t from_start,
1716 Py_ssize_t how_many)
1717{
1718 int err;
1719
1720 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1721 PyErr_BadInternalCall();
1722 return -1;
1723 }
1724
Benjamin Petersonbac79492012-01-14 13:34:47 -05001725 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001726 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001727 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001728 return -1;
1729
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001730 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001731 PyErr_SetString(PyExc_IndexError, "string index out of range");
1732 return -1;
1733 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001734 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001735 PyErr_SetString(PyExc_IndexError, "string index out of range");
1736 return -1;
1737 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001738 if (how_many < 0) {
1739 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1740 return -1;
1741 }
1742 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001743 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1744 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001745 "Cannot write %zi characters at %zi "
1746 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001747 how_many, to_start, PyUnicode_GET_LENGTH(to));
1748 return -1;
1749 }
1750
1751 if (how_many == 0)
1752 return 0;
1753
Victor Stinner488fa492011-12-12 00:01:39 +01001754 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001755 return -1;
1756
1757 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1758 if (err) {
1759 PyErr_Format(PyExc_SystemError,
1760 "Cannot copy %s characters "
1761 "into a string of %s characters",
1762 unicode_kind_name(from),
1763 unicode_kind_name(to));
1764 return -1;
1765 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001766 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767}
1768
Victor Stinner17222162011-09-28 22:15:37 +02001769/* Find the maximum code point and count the number of surrogate pairs so a
1770 correct string length can be computed before converting a string to UCS4.
1771 This function counts single surrogates as a character and not as a pair.
1772
1773 Return 0 on success, or -1 on error. */
1774static int
1775find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1776 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777{
1778 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001779 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780
Victor Stinnerc53be962011-10-02 21:33:54 +02001781 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 *num_surrogates = 0;
1783 *maxchar = 0;
1784
1785 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001787 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1788 && (iter+1) < end
1789 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1790 {
1791 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1792 ++(*num_surrogates);
1793 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 }
1795 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001797 {
1798 ch = *iter;
1799 iter++;
1800 }
1801 if (ch > *maxchar) {
1802 *maxchar = ch;
1803 if (*maxchar > MAX_UNICODE) {
1804 PyErr_Format(PyExc_ValueError,
1805 "character U+%x is not in range [U+0000; U+10ffff]",
1806 ch);
1807 return -1;
1808 }
1809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 }
1811 return 0;
1812}
1813
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001814int
1815_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816{
1817 wchar_t *end;
1818 Py_UCS4 maxchar = 0;
1819 Py_ssize_t num_surrogates;
1820#if SIZEOF_WCHAR_T == 2
1821 Py_ssize_t length_wo_surrogates;
1822#endif
1823
Georg Brandl7597add2011-10-05 16:36:47 +02001824 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001825 strings were created using _PyObject_New() and where no canonical
1826 representation (the str field) has been set yet aka strings
1827 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001828 assert(_PyUnicode_CHECK(unicode));
1829 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001831 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001832 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001833 /* Actually, it should neither be interned nor be anything else: */
1834 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001837 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001838 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840
1841 if (maxchar < 256) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001842 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001843 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 PyErr_NoMemory();
1845 return -1;
1846 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001847 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 _PyUnicode_WSTR(unicode), end,
1849 PyUnicode_1BYTE_DATA(unicode));
1850 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1851 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1852 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1853 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001854 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001855 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001856 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 }
1858 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001859 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001860 _PyUnicode_UTF8(unicode) = NULL;
1861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001863 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 _PyUnicode_WSTR(unicode) = NULL;
1865 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1866 }
1867 /* In this case we might have to convert down from 4-byte native
1868 wchar_t to 2-byte unicode. */
1869 else if (maxchar < 65536) {
1870 assert(num_surrogates == 0 &&
1871 "FindMaxCharAndNumSurrogatePairs() messed up");
1872
Victor Stinner506f5922011-09-28 22:34:18 +02001873#if SIZEOF_WCHAR_T == 2
1874 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001875 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001876 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1877 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1878 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001879 _PyUnicode_UTF8(unicode) = NULL;
1880 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001881#else
1882 /* sizeof(wchar_t) == 4 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001883 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
Victor Stinner506f5922011-09-28 22:34:18 +02001884 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001885 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001886 PyErr_NoMemory();
1887 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 }
Victor Stinner506f5922011-09-28 22:34:18 +02001889 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1890 _PyUnicode_WSTR(unicode), end,
1891 PyUnicode_2BYTE_DATA(unicode));
1892 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1893 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1894 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001895 _PyUnicode_UTF8(unicode) = NULL;
1896 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner32bd68c2020-12-01 10:37:39 +01001897 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinner506f5922011-09-28 22:34:18 +02001898 _PyUnicode_WSTR(unicode) = NULL;
1899 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1900#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 }
Ikko Ashimine38811d62020-11-10 14:57:34 +09001902 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 else {
1904#if SIZEOF_WCHAR_T == 2
1905 /* in case the native representation is 2-bytes, we need to allocate a
1906 new normalized 4-byte version. */
1907 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001908 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1909 PyErr_NoMemory();
1910 return -1;
1911 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001912 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001913 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 PyErr_NoMemory();
1915 return -1;
1916 }
1917 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1918 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001919 _PyUnicode_UTF8(unicode) = NULL;
1920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001921 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1922 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001923 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001924 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925 _PyUnicode_WSTR(unicode) = NULL;
1926 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1927#else
1928 assert(num_surrogates == 0);
1929
Victor Stinnerc3c74152011-10-02 20:39:55 +02001930 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001931 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001932 _PyUnicode_UTF8(unicode) = NULL;
1933 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1935#endif
1936 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1937 }
1938 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001939 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 return 0;
1941}
1942
Alexander Belopolsky40018472011-02-26 01:02:56 +00001943static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001944unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945{
Walter Dörwald16807132007-05-25 13:52:07 +00001946 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001947 case SSTATE_NOT_INTERNED:
1948 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001949
Benjamin Peterson29060642009-01-31 22:14:21 +00001950 case SSTATE_INTERNED_MORTAL:
Victor Stinner607b1022020-05-05 18:50:30 +02001951#ifdef INTERNED_STRINGS
Victor Stinner3549ca32020-07-03 16:59:12 +02001952 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1953 references (key and value) which were ignored by
1954 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1955 to prevent calling unicode_dealloc() again. Adjust refcnt after
1956 PyDict_DelItem(). */
1957 assert(Py_REFCNT(unicode) == 0);
1958 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001959 if (PyDict_DelItem(interned, unicode) != 0) {
1960 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1961 NULL);
1962 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001963 assert(Py_REFCNT(unicode) == 1);
1964 Py_SET_REFCNT(unicode, 0);
Victor Stinner607b1022020-05-05 18:50:30 +02001965#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001966 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001967
Benjamin Peterson29060642009-01-31 22:14:21 +00001968 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001969 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1970 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001971
Benjamin Peterson29060642009-01-31 22:14:21 +00001972 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001973 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001974 }
1975
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001976 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001977 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001978 }
1979 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001980 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001981 }
1982 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001983 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001986 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987}
1988
#ifdef Py_DEBUG
/* Debug-only helper.

   Return 1 if `unicode` is one of the shared objects kept in the unicode
   state (the cached empty string, or the cached one-character string stored
   in the latin1[] table), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;

    if (state->empty_string == unicode) {
        return 1;
    }

    /* Only a length-1 string that is not of WCHAR kind is looked up in
       the latin1[] cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && state->latin1[ch] == unicode) ? 1 : 0;
}
#endif
2008
Alexander Belopolsky40018472011-02-26 01:02:56 +00002009static int
Victor Stinner488fa492011-12-12 00:01:39 +01002010unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002011{
Victor Stinner488fa492011-12-12 00:01:39 +01002012 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002013 if (Py_REFCNT(unicode) != 1)
2014 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002015 if (_PyUnicode_HASH(unicode) != -1)
2016 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002017 if (PyUnicode_CHECK_INTERNED(unicode))
2018 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002019 if (!PyUnicode_CheckExact(unicode))
2020 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002021#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002022 /* singleton refcount is greater than 1 */
2023 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002024#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002025 return 1;
2026}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002027
Victor Stinnerfe226c02011-10-03 03:52:20 +02002028static int
2029unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2030{
2031 PyObject *unicode;
2032 Py_ssize_t old_length;
2033
2034 assert(p_unicode != NULL);
2035 unicode = *p_unicode;
2036
2037 assert(unicode != NULL);
2038 assert(PyUnicode_Check(unicode));
2039 assert(0 <= length);
2040
Victor Stinner910337b2011-10-03 03:20:16 +02002041 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002042 old_length = PyUnicode_WSTR_LENGTH(unicode);
2043 else
2044 old_length = PyUnicode_GET_LENGTH(unicode);
2045 if (old_length == length)
2046 return 0;
2047
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002048 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002049 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002050 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002051 return 0;
2052 }
2053
Victor Stinner488fa492011-12-12 00:01:39 +01002054 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002055 PyObject *copy = resize_copy(unicode, length);
2056 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002057 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002058 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002060 }
2061
Victor Stinnerfe226c02011-10-03 03:52:20 +02002062 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002063 PyObject *new_unicode = resize_compact(unicode, length);
2064 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002065 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002066 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002067 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002068 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002069 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002070}
2071
Alexander Belopolsky40018472011-02-26 01:02:56 +00002072int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002073PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002074{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002075 PyObject *unicode;
2076 if (p_unicode == NULL) {
2077 PyErr_BadInternalCall();
2078 return -1;
2079 }
2080 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002081 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002082 {
2083 PyErr_BadInternalCall();
2084 return -1;
2085 }
2086 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002087}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002088
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002089/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002090
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002091 WARNING: The function doesn't copy the terminating null character and
2092 doesn't check the maximum character (may write a latin1 character in an
2093 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002094static void
2095unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2096 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002097{
2098 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002099 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002100 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002101
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002102 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002103 switch (kind) {
2104 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002105#ifdef Py_DEBUG
2106 if (PyUnicode_IS_ASCII(unicode)) {
2107 Py_UCS4 maxchar = ucs1lib_find_max_char(
2108 (const Py_UCS1*)str,
2109 (const Py_UCS1*)str + len);
2110 assert(maxchar < 128);
2111 }
2112#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002113 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002114 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002115 }
2116 case PyUnicode_2BYTE_KIND: {
2117 Py_UCS2 *start = (Py_UCS2 *)data + index;
2118 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002119
Victor Stinner184252a2012-06-16 02:57:41 +02002120 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002121 *ucs2 = (Py_UCS2)*str;
2122
2123 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002124 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002125 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002126 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002127 Py_UCS4 *start = (Py_UCS4 *)data + index;
2128 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002129
Victor Stinner184252a2012-06-16 02:57:41 +02002130 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002131 *ucs4 = (Py_UCS4)*str;
2132
2133 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002134 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002135 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002136 default:
2137 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002138 }
2139}
2140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002141static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002142get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002144 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002145
Victor Stinner2f9ada92020-06-24 02:22:21 +02002146 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002147 if (unicode) {
2148 Py_INCREF(unicode);
2149 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002150 }
Victor Stinner607b1022020-05-05 18:50:30 +02002151
2152 unicode = PyUnicode_New(1, ch);
2153 if (!unicode) {
2154 return NULL;
2155 }
2156
2157 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2158 assert(_PyUnicode_CheckConsistency(unicode, 1));
2159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002161 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002162 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163}
2164
Victor Stinner985a82a2014-01-03 12:53:47 +01002165static PyObject*
2166unicode_char(Py_UCS4 ch)
2167{
2168 PyObject *unicode;
2169
2170 assert(ch <= MAX_UNICODE);
2171
Victor Stinner2f9ada92020-06-24 02:22:21 +02002172 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002173 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002174 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002175
Victor Stinner985a82a2014-01-03 12:53:47 +01002176 unicode = PyUnicode_New(1, ch);
2177 if (unicode == NULL)
2178 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002179
2180 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2181 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002182 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002183 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002184 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2185 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2186 }
2187 assert(_PyUnicode_CheckConsistency(unicode, 1));
2188 return unicode;
2189}
2190
Alexander Belopolsky40018472011-02-26 01:02:56 +00002191PyObject *
2192PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002194 if (u == NULL) {
2195 if (size > 0) {
2196 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2197 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2198 "use PyUnicode_New() instead", 1) < 0) {
2199 return NULL;
2200 }
2201 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002202 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002203 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002204
2205 if (size < 0) {
2206 PyErr_BadInternalCall();
2207 return NULL;
2208 }
2209
2210 return PyUnicode_FromWideChar(u, size);
2211}
2212
2213PyObject *
2214PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2215{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002216 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 Py_UCS4 maxchar = 0;
2218 Py_ssize_t num_surrogates;
2219
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002220 if (u == NULL && size != 0) {
2221 PyErr_BadInternalCall();
2222 return NULL;
2223 }
2224
2225 if (size == -1) {
2226 size = wcslen(u);
2227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002229 /* If the Unicode data is known at construction time, we can apply
2230 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002233 if (size == 0)
2234 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 /* Single character Unicode objects in the Latin-1 range are
2237 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002238 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 return get_latin1_char((unsigned char)*u);
2240
2241 /* If not empty and not single character, copy the Unicode data
2242 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002243 if (find_maxchar_surrogates(u, u + size,
2244 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 return NULL;
2246
Victor Stinner8faf8212011-12-08 22:14:11 +01002247 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 if (!unicode)
2249 return NULL;
2250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 switch (PyUnicode_KIND(unicode)) {
2252 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002253 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2255 break;
2256 case PyUnicode_2BYTE_KIND:
2257#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002258 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002260 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2262#endif
2263 break;
2264 case PyUnicode_4BYTE_KIND:
2265#if SIZEOF_WCHAR_T == 2
2266 /* This is the only case which has to process surrogates, thus
2267 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002268 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269#else
2270 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002271 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272#endif
2273 break;
2274 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002275 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002278 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279}
2280
Alexander Belopolsky40018472011-02-26 01:02:56 +00002281PyObject *
2282PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002283{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 if (size < 0) {
2285 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002286 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002287 return NULL;
2288 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002289 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002290 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002291 }
2292 else {
2293 if (size > 0) {
2294 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2295 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2296 "use PyUnicode_New() instead", 1) < 0) {
2297 return NULL;
2298 }
2299 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002300 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002301 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002302}
2303
Alexander Belopolsky40018472011-02-26 01:02:56 +00002304PyObject *
2305PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002306{
2307 size_t size = strlen(u);
2308 if (size > PY_SSIZE_T_MAX) {
2309 PyErr_SetString(PyExc_OverflowError, "input too long");
2310 return NULL;
2311 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002312 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002313}
2314
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002315PyObject *
2316_PyUnicode_FromId(_Py_Identifier *id)
2317{
Victor Stinner297257f2020-06-02 14:39:45 +02002318 if (id->object) {
2319 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002320 }
Victor Stinner297257f2020-06-02 14:39:45 +02002321
2322 PyObject *obj;
2323 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2324 strlen(id->string),
2325 NULL, NULL);
2326 if (!obj) {
2327 return NULL;
2328 }
2329 PyUnicode_InternInPlace(&obj);
2330
2331 assert(!id->next);
2332 id->object = obj;
2333 id->next = static_strings;
2334 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002335 return id->object;
2336}
2337
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002338static void
2339unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002340{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002341 _Py_Identifier *tmp, *s = static_strings;
2342 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002343 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002344 tmp = s->next;
2345 s->next = NULL;
2346 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002347 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002348 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002349}
2350
Benjamin Peterson0df54292012-03-26 14:50:32 -04002351/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002352
Victor Stinnerd3f08822012-05-29 12:57:52 +02002353PyObject*
2354_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002355{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002356 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002357 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002358 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002359#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002360 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002361#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002362 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002363 }
Victor Stinner785938e2011-12-11 20:09:03 +01002364 unicode = PyUnicode_New(size, 127);
2365 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002366 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002367 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2368 assert(_PyUnicode_CheckConsistency(unicode, 1));
2369 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002370}
2371
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002372static Py_UCS4
2373kind_maxchar_limit(unsigned int kind)
2374{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002375 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002376 case PyUnicode_1BYTE_KIND:
2377 return 0x80;
2378 case PyUnicode_2BYTE_KIND:
2379 return 0x100;
2380 case PyUnicode_4BYTE_KIND:
2381 return 0x10000;
2382 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002383 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002384 }
2385}
2386
Victor Stinner702c7342011-10-05 13:50:52 +02002387static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002388_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002389{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002391 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002392
Victor Stinner2f9ada92020-06-24 02:22:21 +02002393 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002394 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002395 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002396 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002397 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002398 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002399 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002400
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002401 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002402 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 if (!res)
2404 return NULL;
2405 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002406 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002408}
2409
Victor Stinnere57b1c02011-09-28 22:20:48 +02002410static PyObject*
2411_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412{
2413 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002414 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002415
Serhiy Storchaka678db842013-01-26 12:16:36 +02002416 if (size == 0)
2417 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002418 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002419 if (size == 1)
2420 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002421
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002422 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002423 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 if (!res)
2425 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002426 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002427 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002428 else {
2429 _PyUnicode_CONVERT_BYTES(
2430 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2431 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002432 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 return res;
2434}
2435
Victor Stinnere57b1c02011-09-28 22:20:48 +02002436static PyObject*
2437_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438{
2439 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002440 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002441
Serhiy Storchaka678db842013-01-26 12:16:36 +02002442 if (size == 0)
2443 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002444 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002445 if (size == 1)
2446 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002447
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002448 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002449 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 if (!res)
2451 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002452 if (max_char < 256)
2453 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2454 PyUnicode_1BYTE_DATA(res));
2455 else if (max_char < 0x10000)
2456 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2457 PyUnicode_2BYTE_DATA(res));
2458 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002460 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 return res;
2462}
2463
2464PyObject*
2465PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2466{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002467 if (size < 0) {
2468 PyErr_SetString(PyExc_ValueError, "size must be positive");
2469 return NULL;
2470 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002471 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002473 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002475 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002477 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002478 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002479 PyErr_SetString(PyExc_SystemError, "invalid kind");
2480 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002482}
2483
Victor Stinnerece58de2012-04-23 23:36:38 +02002484Py_UCS4
2485_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2486{
2487 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002488 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002489
2490 assert(PyUnicode_IS_READY(unicode));
2491 assert(0 <= start);
2492 assert(end <= PyUnicode_GET_LENGTH(unicode));
2493 assert(start <= end);
2494
2495 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2496 return PyUnicode_MAX_CHAR_VALUE(unicode);
2497
2498 if (start == end)
2499 return 127;
2500
Victor Stinner94d558b2012-04-27 22:26:58 +02002501 if (PyUnicode_IS_ASCII(unicode))
2502 return 127;
2503
Victor Stinnerece58de2012-04-23 23:36:38 +02002504 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002505 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002506 endptr = (char *)startptr + end * kind;
2507 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002508 switch(kind) {
2509 case PyUnicode_1BYTE_KIND:
2510 return ucs1lib_find_max_char(startptr, endptr);
2511 case PyUnicode_2BYTE_KIND:
2512 return ucs2lib_find_max_char(startptr, endptr);
2513 case PyUnicode_4BYTE_KIND:
2514 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002515 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002516 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002517 }
2518}
2519
Victor Stinner25a4b292011-10-06 12:31:55 +02002520/* Ensure that a string uses the most efficient storage, if it is not the
2521 case: create a new string with of the right kind. Write NULL into *p_unicode
2522 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002523static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002524unicode_adjust_maxchar(PyObject **p_unicode)
2525{
2526 PyObject *unicode, *copy;
2527 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002528 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002529 unsigned int kind;
2530
2531 assert(p_unicode != NULL);
2532 unicode = *p_unicode;
2533 assert(PyUnicode_IS_READY(unicode));
2534 if (PyUnicode_IS_ASCII(unicode))
2535 return;
2536
2537 len = PyUnicode_GET_LENGTH(unicode);
2538 kind = PyUnicode_KIND(unicode);
2539 if (kind == PyUnicode_1BYTE_KIND) {
2540 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002541 max_char = ucs1lib_find_max_char(u, u + len);
2542 if (max_char >= 128)
2543 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002544 }
2545 else if (kind == PyUnicode_2BYTE_KIND) {
2546 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002547 max_char = ucs2lib_find_max_char(u, u + len);
2548 if (max_char >= 256)
2549 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002550 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002551 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002552 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002553 max_char = ucs4lib_find_max_char(u, u + len);
2554 if (max_char >= 0x10000)
2555 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002556 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002557 else
2558 Py_UNREACHABLE();
2559
Victor Stinner25a4b292011-10-06 12:31:55 +02002560 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002561 if (copy != NULL)
2562 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002563 Py_DECREF(unicode);
2564 *p_unicode = copy;
2565}
2566
Victor Stinner034f6cf2011-09-30 02:26:44 +02002567PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002568_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002569{
Victor Stinner87af4f22011-11-21 23:03:47 +01002570 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002571 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002572
Victor Stinner034f6cf2011-09-30 02:26:44 +02002573 if (!PyUnicode_Check(unicode)) {
2574 PyErr_BadInternalCall();
2575 return NULL;
2576 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002577 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002578 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002579
Victor Stinner87af4f22011-11-21 23:03:47 +01002580 length = PyUnicode_GET_LENGTH(unicode);
2581 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002582 if (!copy)
2583 return NULL;
2584 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2585
Christian Heimesf051e432016-09-13 20:22:02 +02002586 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002587 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002588 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002589 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002590}
2591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592
Victor Stinnerbc603d12011-10-02 01:00:40 +02002593/* Widen Unicode objects to larger buffers. Don't write terminating null
2594 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002596static void*
2597unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002599 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002600
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002601 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002602 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002603 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002604 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002605 if (!result)
2606 return PyErr_NoMemory();
2607 assert(skind == PyUnicode_1BYTE_KIND);
2608 _PyUnicode_CONVERT_BYTES(
2609 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002610 (const Py_UCS1 *)data,
2611 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002612 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002614 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002615 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002616 if (!result)
2617 return PyErr_NoMemory();
2618 if (skind == PyUnicode_2BYTE_KIND) {
2619 _PyUnicode_CONVERT_BYTES(
2620 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002621 (const Py_UCS2 *)data,
2622 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002623 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002625 else {
2626 assert(skind == PyUnicode_1BYTE_KIND);
2627 _PyUnicode_CONVERT_BYTES(
2628 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002629 (const Py_UCS1 *)data,
2630 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002631 result);
2632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002634 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002635 Py_UNREACHABLE();
2636 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638}
2639
2640static Py_UCS4*
2641as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2642 int copy_null)
2643{
2644 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002645 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002646 Py_ssize_t len, targetlen;
2647 if (PyUnicode_READY(string) == -1)
2648 return NULL;
2649 kind = PyUnicode_KIND(string);
2650 data = PyUnicode_DATA(string);
2651 len = PyUnicode_GET_LENGTH(string);
2652 targetlen = len;
2653 if (copy_null)
2654 targetlen++;
2655 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002656 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 if (!target) {
2658 PyErr_NoMemory();
2659 return NULL;
2660 }
2661 }
2662 else {
2663 if (targetsize < targetlen) {
2664 PyErr_Format(PyExc_SystemError,
2665 "string is longer than the buffer");
2666 if (copy_null && 0 < targetsize)
2667 target[0] = 0;
2668 return NULL;
2669 }
2670 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002671 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002672 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002673 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002675 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002676 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002677 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2678 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002679 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002680 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002681 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002682 else {
2683 Py_UNREACHABLE();
2684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 if (copy_null)
2686 target[len] = 0;
2687 return target;
2688}
2689
2690Py_UCS4*
2691PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2692 int copy_null)
2693{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002694 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 PyErr_BadInternalCall();
2696 return NULL;
2697 }
2698 return as_ucs4(string, target, targetsize, copy_null);
2699}
2700
2701Py_UCS4*
2702PyUnicode_AsUCS4Copy(PyObject *string)
2703{
2704 return as_ucs4(string, NULL, 0, 1);
2705}
2706
Victor Stinner15a11362012-10-06 23:48:20 +02002707/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002708 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2709 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2710#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002711
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002712static int
2713unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2714 Py_ssize_t width, Py_ssize_t precision)
2715{
2716 Py_ssize_t length, fill, arglen;
2717 Py_UCS4 maxchar;
2718
2719 if (PyUnicode_READY(str) == -1)
2720 return -1;
2721
2722 length = PyUnicode_GET_LENGTH(str);
2723 if ((precision == -1 || precision >= length)
2724 && width <= length)
2725 return _PyUnicodeWriter_WriteStr(writer, str);
2726
2727 if (precision != -1)
2728 length = Py_MIN(precision, length);
2729
2730 arglen = Py_MAX(length, width);
2731 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2732 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2733 else
2734 maxchar = writer->maxchar;
2735
2736 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2737 return -1;
2738
2739 if (width > length) {
2740 fill = width - length;
2741 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2742 return -1;
2743 writer->pos += fill;
2744 }
2745
2746 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2747 str, 0, length);
2748 writer->pos += length;
2749 return 0;
2750}
2751
2752static int
Victor Stinner998b8062018-09-12 00:23:25 +02002753unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002754 Py_ssize_t width, Py_ssize_t precision)
2755{
2756 /* UTF-8 */
2757 Py_ssize_t length;
2758 PyObject *unicode;
2759 int res;
2760
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002761 if (precision == -1) {
2762 length = strlen(str);
2763 }
2764 else {
2765 length = 0;
2766 while (length < precision && str[length]) {
2767 length++;
2768 }
2769 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002770 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2771 if (unicode == NULL)
2772 return -1;
2773
2774 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2775 Py_DECREF(unicode);
2776 return res;
2777}
2778
Victor Stinner96865452011-03-01 23:44:09 +00002779static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002780unicode_fromformat_arg(_PyUnicodeWriter *writer,
2781 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002782{
Victor Stinnere215d962012-10-06 23:03:36 +02002783 const char *p;
2784 Py_ssize_t len;
2785 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 Py_ssize_t width;
2787 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002788 int longflag;
2789 int longlongflag;
2790 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002791 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002792
2793 p = f;
2794 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002795 zeropad = 0;
2796 if (*f == '0') {
2797 zeropad = 1;
2798 f++;
2799 }
Victor Stinner96865452011-03-01 23:44:09 +00002800
2801 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002802 width = -1;
2803 if (Py_ISDIGIT((unsigned)*f)) {
2804 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002805 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002806 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002807 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002808 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002809 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002810 return NULL;
2811 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002812 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002813 f++;
2814 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002815 }
2816 precision = -1;
2817 if (*f == '.') {
2818 f++;
2819 if (Py_ISDIGIT((unsigned)*f)) {
2820 precision = (*f - '0');
2821 f++;
2822 while (Py_ISDIGIT((unsigned)*f)) {
2823 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2824 PyErr_SetString(PyExc_ValueError,
2825 "precision too big");
2826 return NULL;
2827 }
2828 precision = (precision * 10) + (*f - '0');
2829 f++;
2830 }
2831 }
Victor Stinner96865452011-03-01 23:44:09 +00002832 if (*f == '%') {
2833 /* "%.3%s" => f points to "3" */
2834 f--;
2835 }
2836 }
2837 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002838 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002839 f--;
2840 }
Victor Stinner96865452011-03-01 23:44:09 +00002841
2842 /* Handle %ld, %lu, %lld and %llu. */
2843 longflag = 0;
2844 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002845 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002846 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002847 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002848 longflag = 1;
2849 ++f;
2850 }
Victor Stinner96865452011-03-01 23:44:09 +00002851 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002852 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002853 longlongflag = 1;
2854 f += 2;
2855 }
Victor Stinner96865452011-03-01 23:44:09 +00002856 }
2857 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002858 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002859 size_tflag = 1;
2860 ++f;
2861 }
Victor Stinnere215d962012-10-06 23:03:36 +02002862
2863 if (f[1] == '\0')
2864 writer->overallocate = 0;
2865
2866 switch (*f) {
2867 case 'c':
2868 {
2869 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002870 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002871 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002872 "character argument not in range(0x110000)");
2873 return NULL;
2874 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002875 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002876 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002877 break;
2878 }
2879
2880 case 'i':
2881 case 'd':
2882 case 'u':
2883 case 'x':
2884 {
2885 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002886 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002887 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002888
2889 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002890 if (longflag) {
2891 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2892 }
2893 else if (longlongflag) {
2894 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2895 }
2896 else if (size_tflag) {
2897 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2898 }
2899 else {
2900 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2901 }
Victor Stinnere215d962012-10-06 23:03:36 +02002902 }
2903 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002904 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002905 }
2906 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002907 if (longflag) {
2908 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2909 }
2910 else if (longlongflag) {
2911 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2912 }
2913 else if (size_tflag) {
2914 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2915 }
2916 else {
2917 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2918 }
Victor Stinnere215d962012-10-06 23:03:36 +02002919 }
2920 assert(len >= 0);
2921
Victor Stinnere215d962012-10-06 23:03:36 +02002922 if (precision < len)
2923 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002924
2925 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002926 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2927 return NULL;
2928
Victor Stinnere215d962012-10-06 23:03:36 +02002929 if (width > precision) {
2930 Py_UCS4 fillchar;
2931 fill = width - precision;
2932 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002933 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2934 return NULL;
2935 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002936 }
Victor Stinner15a11362012-10-06 23:48:20 +02002937 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002938 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002939 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2940 return NULL;
2941 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002942 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002943
Victor Stinner4a587072013-11-19 12:54:53 +01002944 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2945 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002946 break;
2947 }
2948
2949 case 'p':
2950 {
2951 char number[MAX_LONG_LONG_CHARS];
2952
2953 len = sprintf(number, "%p", va_arg(*vargs, void*));
2954 assert(len >= 0);
2955
2956 /* %p is ill-defined: ensure leading 0x. */
2957 if (number[1] == 'X')
2958 number[1] = 'x';
2959 else if (number[1] != 'x') {
2960 memmove(number + 2, number,
2961 strlen(number) + 1);
2962 number[0] = '0';
2963 number[1] = 'x';
2964 len += 2;
2965 }
2966
Victor Stinner4a587072013-11-19 12:54:53 +01002967 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002968 return NULL;
2969 break;
2970 }
2971
2972 case 's':
2973 {
2974 /* UTF-8 */
2975 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002976 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002977 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002978 break;
2979 }
2980
2981 case 'U':
2982 {
2983 PyObject *obj = va_arg(*vargs, PyObject *);
2984 assert(obj && _PyUnicode_CHECK(obj));
2985
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002986 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002987 return NULL;
2988 break;
2989 }
2990
2991 case 'V':
2992 {
2993 PyObject *obj = va_arg(*vargs, PyObject *);
2994 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002995 if (obj) {
2996 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002997 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002998 return NULL;
2999 }
3000 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003001 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02003002 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003003 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003004 }
3005 break;
3006 }
3007
3008 case 'S':
3009 {
3010 PyObject *obj = va_arg(*vargs, PyObject *);
3011 PyObject *str;
3012 assert(obj);
3013 str = PyObject_Str(obj);
3014 if (!str)
3015 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003016 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003017 Py_DECREF(str);
3018 return NULL;
3019 }
3020 Py_DECREF(str);
3021 break;
3022 }
3023
3024 case 'R':
3025 {
3026 PyObject *obj = va_arg(*vargs, PyObject *);
3027 PyObject *repr;
3028 assert(obj);
3029 repr = PyObject_Repr(obj);
3030 if (!repr)
3031 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003032 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003033 Py_DECREF(repr);
3034 return NULL;
3035 }
3036 Py_DECREF(repr);
3037 break;
3038 }
3039
3040 case 'A':
3041 {
3042 PyObject *obj = va_arg(*vargs, PyObject *);
3043 PyObject *ascii;
3044 assert(obj);
3045 ascii = PyObject_ASCII(obj);
3046 if (!ascii)
3047 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003048 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003049 Py_DECREF(ascii);
3050 return NULL;
3051 }
3052 Py_DECREF(ascii);
3053 break;
3054 }
3055
3056 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003057 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003058 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003059 break;
3060
3061 default:
3062 /* if we stumble upon an unknown formatting code, copy the rest
3063 of the format string to the output string. (we cannot just
3064 skip the code, since there's no way to know what's in the
3065 argument list) */
3066 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003067 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003068 return NULL;
3069 f = p+len;
3070 return f;
3071 }
3072
3073 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003074 return f;
3075}
3076
Walter Dörwaldd2034312007-05-18 16:29:38 +00003077PyObject *
3078PyUnicode_FromFormatV(const char *format, va_list vargs)
3079{
Victor Stinnere215d962012-10-06 23:03:36 +02003080 va_list vargs2;
3081 const char *f;
3082 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003083
Victor Stinner8f674cc2013-04-17 23:02:17 +02003084 _PyUnicodeWriter_Init(&writer);
3085 writer.min_length = strlen(format) + 100;
3086 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003087
Benjamin Peterson0c212142016-09-20 20:39:33 -07003088 // Copy varags to be able to pass a reference to a subfunction.
3089 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003090
3091 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003092 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003093 f = unicode_fromformat_arg(&writer, f, &vargs2);
3094 if (f == NULL)
3095 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003097 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003098 const char *p;
3099 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003100
Victor Stinnere215d962012-10-06 23:03:36 +02003101 p = f;
3102 do
3103 {
3104 if ((unsigned char)*p > 127) {
3105 PyErr_Format(PyExc_ValueError,
3106 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3107 "string, got a non-ASCII byte: 0x%02x",
3108 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003109 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003110 }
3111 p++;
3112 }
3113 while (*p != '\0' && *p != '%');
3114 len = p - f;
3115
3116 if (*p == '\0')
3117 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003118
3119 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003120 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003121
3122 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003123 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003124 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003125 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003126 return _PyUnicodeWriter_Finish(&writer);
3127
3128 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003129 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003130 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003131 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003132}
3133
Walter Dörwaldd2034312007-05-18 16:29:38 +00003134PyObject *
3135PyUnicode_FromFormat(const char *format, ...)
3136{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003137 PyObject* ret;
3138 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003139
3140#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003141 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003142#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003143 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003144#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003145 ret = PyUnicode_FromFormatV(format, vargs);
3146 va_end(vargs);
3147 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003148}
3149
Serhiy Storchakac46db922018-10-23 22:58:24 +03003150static Py_ssize_t
3151unicode_get_widechar_size(PyObject *unicode)
3152{
3153 Py_ssize_t res;
3154
3155 assert(unicode != NULL);
3156 assert(_PyUnicode_CHECK(unicode));
3157
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003158#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003159 if (_PyUnicode_WSTR(unicode) != NULL) {
3160 return PyUnicode_WSTR_LENGTH(unicode);
3161 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003162#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003163 assert(PyUnicode_IS_READY(unicode));
3164
3165 res = _PyUnicode_LENGTH(unicode);
3166#if SIZEOF_WCHAR_T == 2
3167 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3168 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3169 const Py_UCS4 *end = s + res;
3170 for (; s < end; ++s) {
3171 if (*s > 0xFFFF) {
3172 ++res;
3173 }
3174 }
3175 }
3176#endif
3177 return res;
3178}
3179
3180static void
3181unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3182{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003183 assert(unicode != NULL);
3184 assert(_PyUnicode_CHECK(unicode));
3185
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003186#if USE_UNICODE_WCHAR_CACHE
3187 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003188 if (wstr != NULL) {
3189 memcpy(w, wstr, size * sizeof(wchar_t));
3190 return;
3191 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003192#else /* USE_UNICODE_WCHAR_CACHE */
3193 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3194 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3195 return;
3196 }
3197#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003198 assert(PyUnicode_IS_READY(unicode));
3199
3200 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3201 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3202 for (; size--; ++s, ++w) {
3203 *w = *s;
3204 }
3205 }
3206 else {
3207#if SIZEOF_WCHAR_T == 4
3208 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3209 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3210 for (; size--; ++s, ++w) {
3211 *w = *s;
3212 }
3213#else
3214 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3215 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3216 for (; size--; ++s, ++w) {
3217 Py_UCS4 ch = *s;
3218 if (ch > 0xFFFF) {
3219 assert(ch <= MAX_UNICODE);
3220 /* encode surrogate pair in this case */
3221 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3222 if (!size--)
3223 break;
3224 *w = Py_UNICODE_LOW_SURROGATE(ch);
3225 }
3226 else {
3227 *w = ch;
3228 }
3229 }
3230#endif
3231 }
3232}
3233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003234#ifdef HAVE_WCHAR_H
3235
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003236/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003237
Victor Stinnerd88d9832011-09-06 02:00:05 +02003238 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003239 character) required to convert the unicode object. Ignore size argument.
3240
Victor Stinnerd88d9832011-09-06 02:00:05 +02003241 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003242 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003243 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003244Py_ssize_t
3245PyUnicode_AsWideChar(PyObject *unicode,
3246 wchar_t *w,
3247 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003248{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003249 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003250
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003251 if (unicode == NULL) {
3252 PyErr_BadInternalCall();
3253 return -1;
3254 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003255 if (!PyUnicode_Check(unicode)) {
3256 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003257 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003258 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003259
3260 res = unicode_get_widechar_size(unicode);
3261 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003262 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003263 }
3264
3265 if (size > res) {
3266 size = res + 1;
3267 }
3268 else {
3269 res = size;
3270 }
3271 unicode_copy_as_widechar(unicode, w, size);
3272 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003273}
3274
Victor Stinner137c34c2010-09-29 10:25:54 +00003275wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003276PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003277 Py_ssize_t *size)
3278{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003279 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003280 Py_ssize_t buflen;
3281
3282 if (unicode == NULL) {
3283 PyErr_BadInternalCall();
3284 return NULL;
3285 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003286 if (!PyUnicode_Check(unicode)) {
3287 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003288 return NULL;
3289 }
3290
Serhiy Storchakac46db922018-10-23 22:58:24 +03003291 buflen = unicode_get_widechar_size(unicode);
3292 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003293 if (buffer == NULL) {
3294 PyErr_NoMemory();
3295 return NULL;
3296 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003297 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3298 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003299 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003300 }
3301 else if (wcslen(buffer) != (size_t)buflen) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01003302 PyMem_Free(buffer);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003303 PyErr_SetString(PyExc_ValueError,
3304 "embedded null character");
3305 return NULL;
3306 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003307 return buffer;
3308}
3309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003310#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003312int
3313_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3314{
3315 wchar_t **p = (wchar_t **)ptr;
3316 if (obj == NULL) {
3317#if !USE_UNICODE_WCHAR_CACHE
3318 PyMem_Free(*p);
3319#endif /* USE_UNICODE_WCHAR_CACHE */
3320 *p = NULL;
3321 return 1;
3322 }
3323 if (PyUnicode_Check(obj)) {
3324#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003325 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3326 if (*p == NULL) {
3327 return 0;
3328 }
3329 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003330#else /* USE_UNICODE_WCHAR_CACHE */
3331 *p = PyUnicode_AsWideCharString(obj, NULL);
3332 if (*p == NULL) {
3333 return 0;
3334 }
3335 return Py_CLEANUP_SUPPORTED;
3336#endif /* USE_UNICODE_WCHAR_CACHE */
3337 }
3338 PyErr_Format(PyExc_TypeError,
3339 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003340 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003341 return 0;
3342}
3343
3344int
3345_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3346{
3347 wchar_t **p = (wchar_t **)ptr;
3348 if (obj == NULL) {
3349#if !USE_UNICODE_WCHAR_CACHE
3350 PyMem_Free(*p);
3351#endif /* USE_UNICODE_WCHAR_CACHE */
3352 *p = NULL;
3353 return 1;
3354 }
3355 if (obj == Py_None) {
3356 *p = NULL;
3357 return 1;
3358 }
3359 if (PyUnicode_Check(obj)) {
3360#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003361 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3362 if (*p == NULL) {
3363 return 0;
3364 }
3365 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003366#else /* USE_UNICODE_WCHAR_CACHE */
3367 *p = PyUnicode_AsWideCharString(obj, NULL);
3368 if (*p == NULL) {
3369 return 0;
3370 }
3371 return Py_CLEANUP_SUPPORTED;
3372#endif /* USE_UNICODE_WCHAR_CACHE */
3373 }
3374 PyErr_Format(PyExc_TypeError,
3375 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003376 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003377 return 0;
3378}
3379
Alexander Belopolsky40018472011-02-26 01:02:56 +00003380PyObject *
3381PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003382{
Victor Stinner8faf8212011-12-08 22:14:11 +01003383 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003384 PyErr_SetString(PyExc_ValueError,
3385 "chr() arg not in range(0x110000)");
3386 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003387 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003388
Victor Stinner985a82a2014-01-03 12:53:47 +01003389 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003390}
3391
Alexander Belopolsky40018472011-02-26 01:02:56 +00003392PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003393PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003395 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003397 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003398 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003399 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003400 Py_INCREF(obj);
3401 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003402 }
3403 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003404 /* For a Unicode subtype that's not a Unicode object,
3405 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003406 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003407 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003408 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003409 "Can't convert '%.100s' object to str implicitly",
3410 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003411 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003412}
3413
Alexander Belopolsky40018472011-02-26 01:02:56 +00003414PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003415PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003416 const char *encoding,
3417 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003418{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003419 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003420 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003421
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003423 PyErr_BadInternalCall();
3424 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003426
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003427 /* Decoding bytes objects is the most common case and should be fast */
3428 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003429 if (PyBytes_GET_SIZE(obj) == 0) {
3430 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3431 return NULL;
3432 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003433 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003434 }
3435 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003436 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3437 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003438 }
3439
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003440 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003441 PyErr_SetString(PyExc_TypeError,
3442 "decoding str is not supported");
3443 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003444 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003445
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003446 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3447 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3448 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003449 "decoding to str: need a bytes-like object, %.80s found",
3450 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003451 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003452 }
Tim Petersced69f82003-09-16 20:30:58 +00003453
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003454 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003455 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003456 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3457 return NULL;
3458 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003459 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003461
Serhiy Storchaka05997252013-01-26 12:14:02 +02003462 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003463 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003464 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465}
3466
/* Normalize an encoding name into the caller-supplied buffer `lower`
   (capacity `lower_len` bytes, including the terminating NUL).

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is collapsed into a single '_', except that a leading
   run produces nothing and a trailing run is dropped.  This is similar to
   encodings.normalize_encoding(), but additionally lowercases the name.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters (the buffer content is then unspecified). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    char *out_end = &lower[lower_len - 1];
    /* Set while a separator run is waiting to be emitted as '_'. */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        /* Emit the deferred separator, but never at the very start. */
        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3515
Alexander Belopolsky40018472011-02-26 01:02:56 +00003516PyObject *
3517PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003518 Py_ssize_t size,
3519 const char *encoding,
3520 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003521{
3522 PyObject *buffer = NULL, *unicode;
3523 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003524 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3525
Victor Stinner22eb6892019-06-26 00:51:05 +02003526 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3527 return NULL;
3528 }
3529
Victor Stinnered076ed2019-06-26 01:49:32 +02003530 if (size == 0) {
3531 _Py_RETURN_UNICODE_EMPTY();
3532 }
3533
Victor Stinner942889a2016-09-05 15:40:10 -07003534 if (encoding == NULL) {
3535 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3536 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003537
Fred Drakee4315f52000-05-09 19:53:39 +00003538 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003539 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3540 char *lower = buflower;
3541
3542 /* Fast paths */
3543 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3544 lower += 3;
3545 if (*lower == '_') {
3546 /* Match "utf8" and "utf_8" */
3547 lower++;
3548 }
3549
3550 if (lower[0] == '8' && lower[1] == 0) {
3551 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3552 }
3553 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3554 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3555 }
3556 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3557 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3558 }
3559 }
3560 else {
3561 if (strcmp(lower, "ascii") == 0
3562 || strcmp(lower, "us_ascii") == 0) {
3563 return PyUnicode_DecodeASCII(s, size, errors);
3564 }
Steve Dowercc16be82016-09-08 10:35:16 -07003565 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003566 else if (strcmp(lower, "mbcs") == 0) {
3567 return PyUnicode_DecodeMBCS(s, size, errors);
3568 }
3569 #endif
3570 else if (strcmp(lower, "latin1") == 0
3571 || strcmp(lower, "latin_1") == 0
3572 || strcmp(lower, "iso_8859_1") == 0
3573 || strcmp(lower, "iso8859_1") == 0) {
3574 return PyUnicode_DecodeLatin1(s, size, errors);
3575 }
3576 }
Victor Stinner37296e82010-06-10 13:36:23 +00003577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578
3579 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003580 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003581 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003582 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003583 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 if (buffer == NULL)
3585 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003586 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 if (unicode == NULL)
3588 goto onError;
3589 if (!PyUnicode_Check(unicode)) {
3590 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003591 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003592 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003593 encoding,
3594 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595 Py_DECREF(unicode);
3596 goto onError;
3597 }
3598 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003599 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003600
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602 Py_XDECREF(buffer);
3603 return NULL;
3604}
3605
Alexander Belopolsky40018472011-02-26 01:02:56 +00003606PyObject *
3607PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003608 const char *encoding,
3609 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003610{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003611 if (!PyUnicode_Check(unicode)) {
3612 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003613 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003614 }
3615
Serhiy Storchaka00939072016-10-27 21:05:49 +03003616 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3617 "PyUnicode_AsDecodedObject() is deprecated; "
3618 "use PyCodec_Decode() to decode from str", 1) < 0)
3619 return NULL;
3620
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003621 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003623
3624 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003625 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003626}
3627
Alexander Belopolsky40018472011-02-26 01:02:56 +00003628PyObject *
3629PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003630 const char *encoding,
3631 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003632{
3633 PyObject *v;
3634
3635 if (!PyUnicode_Check(unicode)) {
3636 PyErr_BadArgument();
3637 goto onError;
3638 }
3639
Serhiy Storchaka00939072016-10-27 21:05:49 +03003640 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3641 "PyUnicode_AsDecodedUnicode() is deprecated; "
3642 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3643 return NULL;
3644
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003645 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003646 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003647
3648 /* Decode via the codec registry */
3649 v = PyCodec_Decode(unicode, encoding, errors);
3650 if (v == NULL)
3651 goto onError;
3652 if (!PyUnicode_Check(v)) {
3653 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003654 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003655 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003656 encoding,
3657 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003658 Py_DECREF(v);
3659 goto onError;
3660 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003661 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003662
Benjamin Peterson29060642009-01-31 22:14:21 +00003663 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003664 return NULL;
3665}
3666
Alexander Belopolsky40018472011-02-26 01:02:56 +00003667PyObject *
3668PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003669 Py_ssize_t size,
3670 const char *encoding,
3671 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672{
3673 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003674
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003675 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3679 Py_DECREF(unicode);
3680 return v;
3681}
3682
Alexander Belopolsky40018472011-02-26 01:02:56 +00003683PyObject *
3684PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003685 const char *encoding,
3686 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003687{
3688 PyObject *v;
3689
3690 if (!PyUnicode_Check(unicode)) {
3691 PyErr_BadArgument();
3692 goto onError;
3693 }
3694
Serhiy Storchaka00939072016-10-27 21:05:49 +03003695 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3696 "PyUnicode_AsEncodedObject() is deprecated; "
3697 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3698 "or PyCodec_Encode() for generic encoding", 1) < 0)
3699 return NULL;
3700
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003701 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003703
3704 /* Encode via the codec registry */
3705 v = PyCodec_Encode(unicode, encoding, errors);
3706 if (v == NULL)
3707 goto onError;
3708 return v;
3709
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003711 return NULL;
3712}
3713
Victor Stinner1b579672011-12-17 05:47:23 +01003714
Victor Stinner2cba6b82018-01-10 22:46:15 +01003715static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003716unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003717 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003718{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003719 Py_ssize_t wlen;
3720 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3721 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003722 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003723 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003724
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003725 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003726 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003727 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003728 return NULL;
3729 }
3730
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003731 char *str;
3732 size_t error_pos;
3733 const char *reason;
3734 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003735 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003736 PyMem_Free(wstr);
3737
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003738 if (res != 0) {
3739 if (res == -2) {
3740 PyObject *exc;
3741 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3742 "locale", unicode,
3743 (Py_ssize_t)error_pos,
3744 (Py_ssize_t)(error_pos+1),
3745 reason);
3746 if (exc != NULL) {
3747 PyCodec_StrictErrors(exc);
3748 Py_DECREF(exc);
3749 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003750 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003751 else if (res == -3) {
3752 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3753 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003754 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003755 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003756 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003757 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003758 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003759
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003760 PyObject *bytes = PyBytes_FromString(str);
3761 PyMem_RawFree(str);
3762 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003763}
3764
Victor Stinnerad158722010-10-27 00:25:46 +00003765PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003766PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3767{
Victor Stinner709d23d2019-05-02 14:56:30 -04003768 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3769 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003770}
3771
3772PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003773PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003774{
Victor Stinner81a7be32020-04-14 15:14:01 +02003775 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003776 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3777 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003778 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003779 fs_codec->error_handler,
3780 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003781 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003782#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003783 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003784 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003785 fs_codec->encoding,
3786 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003787 }
Victor Stinnerad158722010-10-27 00:25:46 +00003788#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003789 else {
3790 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3791 machinery is not ready and so cannot be used:
3792 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003793 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3794 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003795 assert(filesystem_errors != NULL);
3796 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3797 assert(errors != _Py_ERROR_UNKNOWN);
3798#ifdef _Py_FORCE_UTF8_FS_ENCODING
3799 return unicode_encode_utf8(unicode, errors, NULL);
3800#else
3801 return unicode_encode_locale(unicode, errors, 0);
3802#endif
3803 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003804}
3805
Alexander Belopolsky40018472011-02-26 01:02:56 +00003806PyObject *
3807PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003808 const char *encoding,
3809 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810{
3811 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003812 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 if (!PyUnicode_Check(unicode)) {
3815 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 }
Fred Drakee4315f52000-05-09 19:53:39 +00003818
Victor Stinner22eb6892019-06-26 00:51:05 +02003819 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3820 return NULL;
3821 }
3822
Victor Stinner942889a2016-09-05 15:40:10 -07003823 if (encoding == NULL) {
3824 return _PyUnicode_AsUTF8String(unicode, errors);
3825 }
3826
Fred Drakee4315f52000-05-09 19:53:39 +00003827 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003828 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3829 char *lower = buflower;
3830
3831 /* Fast paths */
3832 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3833 lower += 3;
3834 if (*lower == '_') {
3835 /* Match "utf8" and "utf_8" */
3836 lower++;
3837 }
3838
3839 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003841 }
3842 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3843 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3844 }
3845 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3846 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3847 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003848 }
Victor Stinner942889a2016-09-05 15:40:10 -07003849 else {
3850 if (strcmp(lower, "ascii") == 0
3851 || strcmp(lower, "us_ascii") == 0) {
3852 return _PyUnicode_AsASCIIString(unicode, errors);
3853 }
Steve Dowercc16be82016-09-08 10:35:16 -07003854#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003855 else if (strcmp(lower, "mbcs") == 0) {
3856 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3857 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003858#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003859 else if (strcmp(lower, "latin1") == 0 ||
3860 strcmp(lower, "latin_1") == 0 ||
3861 strcmp(lower, "iso_8859_1") == 0 ||
3862 strcmp(lower, "iso8859_1") == 0) {
3863 return _PyUnicode_AsLatin1String(unicode, errors);
3864 }
3865 }
Victor Stinner37296e82010-06-10 13:36:23 +00003866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867
3868 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003869 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003871 return NULL;
3872
3873 /* The normal path */
3874 if (PyBytes_Check(v))
3875 return v;
3876
3877 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003878 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003879 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003880 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003881
3882 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003883 "encoder %s returned bytearray instead of bytes; "
3884 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003885 encoding);
3886 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003887 Py_DECREF(v);
3888 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003889 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003890
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003891 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3892 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003893 Py_DECREF(v);
3894 return b;
3895 }
3896
3897 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003898 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003899 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003900 encoding,
3901 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003902 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003903 return NULL;
3904}
3905
Alexander Belopolsky40018472011-02-26 01:02:56 +00003906PyObject *
3907PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003908 const char *encoding,
3909 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003910{
3911 PyObject *v;
3912
3913 if (!PyUnicode_Check(unicode)) {
3914 PyErr_BadArgument();
3915 goto onError;
3916 }
3917
Serhiy Storchaka00939072016-10-27 21:05:49 +03003918 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3919 "PyUnicode_AsEncodedUnicode() is deprecated; "
3920 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3921 return NULL;
3922
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003923 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003925
3926 /* Encode via the codec registry */
3927 v = PyCodec_Encode(unicode, encoding, errors);
3928 if (v == NULL)
3929 goto onError;
3930 if (!PyUnicode_Check(v)) {
3931 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003932 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003933 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003934 encoding,
3935 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003936 Py_DECREF(v);
3937 goto onError;
3938 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003940
Benjamin Peterson29060642009-01-31 22:14:21 +00003941 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 return NULL;
3943}
3944
Victor Stinner2cba6b82018-01-10 22:46:15 +01003945static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003946unicode_decode_locale(const char *str, Py_ssize_t len,
3947 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003948{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003949 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3950 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003951 return NULL;
3952 }
3953
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003954 wchar_t *wstr;
3955 size_t wlen;
3956 const char *reason;
3957 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003958 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003959 if (res != 0) {
3960 if (res == -2) {
3961 PyObject *exc;
3962 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3963 "locale", str, len,
3964 (Py_ssize_t)wlen,
3965 (Py_ssize_t)(wlen + 1),
3966 reason);
3967 if (exc != NULL) {
3968 PyCodec_StrictErrors(exc);
3969 Py_DECREF(exc);
3970 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003971 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003972 else if (res == -3) {
3973 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3974 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003975 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003976 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003977 }
Victor Stinner2f197072011-12-17 07:08:30 +01003978 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003979 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003980
3981 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3982 PyMem_RawFree(wstr);
3983 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003984}
3985
3986PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003987PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3988 const char *errors)
3989{
Victor Stinner709d23d2019-05-02 14:56:30 -04003990 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3991 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003992}
3993
3994PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003995PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003996{
3997 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003998 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3999 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004000}
4001
4002
4003PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00004004PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004005 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004006 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4007}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004008
Christian Heimes5894ba72007-11-04 11:43:14 +00004009PyObject*
4010PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4011{
Victor Stinner81a7be32020-04-14 15:14:01 +02004012 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004013 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4014 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004015 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004016 fs_codec->error_handler,
4017 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004018 NULL);
4019 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004020#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004021 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004022 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004023 fs_codec->encoding,
4024 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004025 }
Victor Stinnerad158722010-10-27 00:25:46 +00004026#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004027 else {
4028 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4029 machinery is not ready and so cannot be used:
4030 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004031 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4032 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004033 assert(filesystem_errors != NULL);
4034 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4035 assert(errors != _Py_ERROR_UNKNOWN);
4036#ifdef _Py_FORCE_UTF8_FS_ENCODING
4037 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4038#else
4039 return unicode_decode_locale(s, size, errors, 0);
4040#endif
4041 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004042}
4043
Martin v. Löwis011e8422009-05-05 04:43:17 +00004044
4045int
4046PyUnicode_FSConverter(PyObject* arg, void* addr)
4047{
Brett Cannonec6ce872016-09-06 15:50:29 -07004048 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004049 PyObject *output = NULL;
4050 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004051 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004052 if (arg == NULL) {
4053 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004054 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004055 return 1;
4056 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004057 path = PyOS_FSPath(arg);
4058 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004059 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004060 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004061 if (PyBytes_Check(path)) {
4062 output = path;
4063 }
4064 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4065 output = PyUnicode_EncodeFSDefault(path);
4066 Py_DECREF(path);
4067 if (!output) {
4068 return 0;
4069 }
4070 assert(PyBytes_Check(output));
4071 }
4072
Victor Stinner0ea2a462010-04-30 00:22:08 +00004073 size = PyBytes_GET_SIZE(output);
4074 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004075 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004076 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004077 Py_DECREF(output);
4078 return 0;
4079 }
4080 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004081 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004082}
4083
4084
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004085int
4086PyUnicode_FSDecoder(PyObject* arg, void* addr)
4087{
Brett Cannona5711202016-09-06 19:36:01 -07004088 int is_buffer = 0;
4089 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004090 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004091 if (arg == NULL) {
4092 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004093 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004094 return 1;
4095 }
Brett Cannona5711202016-09-06 19:36:01 -07004096
4097 is_buffer = PyObject_CheckBuffer(arg);
4098 if (!is_buffer) {
4099 path = PyOS_FSPath(arg);
4100 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004101 return 0;
4102 }
Brett Cannona5711202016-09-06 19:36:01 -07004103 }
4104 else {
4105 path = arg;
4106 Py_INCREF(arg);
4107 }
4108
4109 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004110 output = path;
4111 }
4112 else if (PyBytes_Check(path) || is_buffer) {
4113 PyObject *path_bytes = NULL;
4114
4115 if (!PyBytes_Check(path) &&
4116 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004117 "path should be string, bytes, or os.PathLike, not %.200s",
4118 Py_TYPE(arg)->tp_name)) {
4119 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004120 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004121 }
4122 path_bytes = PyBytes_FromObject(path);
4123 Py_DECREF(path);
4124 if (!path_bytes) {
4125 return 0;
4126 }
4127 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4128 PyBytes_GET_SIZE(path_bytes));
4129 Py_DECREF(path_bytes);
4130 if (!output) {
4131 return 0;
4132 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004133 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004134 else {
4135 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004136 "path should be string, bytes, or os.PathLike, not %.200s",
4137 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004138 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004139 return 0;
4140 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004141 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004142 Py_DECREF(output);
4143 return 0;
4144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004145 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004146 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004147 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004148 Py_DECREF(output);
4149 return 0;
4150 }
4151 *(PyObject**)addr = output;
4152 return Py_CLEANUP_SUPPORTED;
4153}
4154
4155
Inada Naoki02a4d572020-02-27 13:48:59 +09004156static int unicode_fill_utf8(PyObject *unicode);
4157
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004158const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004160{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004161 if (!PyUnicode_Check(unicode)) {
4162 PyErr_BadArgument();
4163 return NULL;
4164 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004165 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004166 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004168 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004169 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170 return NULL;
4171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172 }
4173
4174 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004175 *psize = PyUnicode_UTF8_LENGTH(unicode);
4176 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004177}
4178
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004179const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004180PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4183}
4184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185Py_UNICODE *
4186PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004188 if (!PyUnicode_Check(unicode)) {
4189 PyErr_BadArgument();
4190 return NULL;
4191 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004192 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4193 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004195 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004196 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197
Serhiy Storchakac46db922018-10-23 22:58:24 +03004198 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4199 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4200 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004201 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004202 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01004203 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
Serhiy Storchakac46db922018-10-23 22:58:24 +03004204 if (w == NULL) {
4205 PyErr_NoMemory();
4206 return NULL;
4207 }
4208 unicode_copy_as_widechar(unicode, w, wlen + 1);
4209 _PyUnicode_WSTR(unicode) = w;
4210 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4211 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004212 }
4213 }
4214 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004215 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004216 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004217}
4218
Inada Naoki2c4928d2020-06-17 20:09:44 +09004219/* Deprecated APIs */
4220
4221_Py_COMP_DIAG_PUSH
4222_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4223
Alexander Belopolsky40018472011-02-26 01:02:56 +00004224Py_UNICODE *
4225PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228}
4229
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004230const Py_UNICODE *
4231_PyUnicode_AsUnicode(PyObject *unicode)
4232{
4233 Py_ssize_t size;
4234 const Py_UNICODE *wstr;
4235
4236 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4237 if (wstr && wcslen(wstr) != (size_t)size) {
4238 PyErr_SetString(PyExc_ValueError, "embedded null character");
4239 return NULL;
4240 }
4241 return wstr;
4242}
4243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004244
Alexander Belopolsky40018472011-02-26 01:02:56 +00004245Py_ssize_t
4246PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247{
4248 if (!PyUnicode_Check(unicode)) {
4249 PyErr_BadArgument();
4250 goto onError;
4251 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004252 if (_PyUnicode_WSTR(unicode) == NULL) {
4253 if (PyUnicode_AsUnicode(unicode) == NULL)
4254 goto onError;
4255 }
4256 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 return -1;
4260}
4261
Inada Naoki2c4928d2020-06-17 20:09:44 +09004262_Py_COMP_DIAG_POP
4263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004264Py_ssize_t
4265PyUnicode_GetLength(PyObject *unicode)
4266{
Victor Stinner07621332012-06-16 04:53:46 +02004267 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 PyErr_BadArgument();
4269 return -1;
4270 }
Victor Stinner07621332012-06-16 04:53:46 +02004271 if (PyUnicode_READY(unicode) == -1)
4272 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004273 return PyUnicode_GET_LENGTH(unicode);
4274}
4275
4276Py_UCS4
4277PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4278{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004279 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004280 int kind;
4281
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004282 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004283 PyErr_BadArgument();
4284 return (Py_UCS4)-1;
4285 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004286 if (PyUnicode_READY(unicode) == -1) {
4287 return (Py_UCS4)-1;
4288 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004289 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004290 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291 return (Py_UCS4)-1;
4292 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004293 data = PyUnicode_DATA(unicode);
4294 kind = PyUnicode_KIND(unicode);
4295 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004296}
4297
4298int
4299PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4300{
4301 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004302 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004303 return -1;
4304 }
Victor Stinner488fa492011-12-12 00:01:39 +01004305 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004306 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004307 PyErr_SetString(PyExc_IndexError, "string index out of range");
4308 return -1;
4309 }
Victor Stinner488fa492011-12-12 00:01:39 +01004310 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004311 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004312 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4313 PyErr_SetString(PyExc_ValueError, "character out of range");
4314 return -1;
4315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004316 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4317 index, ch);
4318 return 0;
4319}
4320
/* Return the name of the default encoding: always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4326
Victor Stinner554f3f02010-06-16 23:33:54 +00004327/* create or adjust a UnicodeDecodeError */
4328static void
4329make_decode_exception(PyObject **exceptionObject,
4330 const char *encoding,
4331 const char *input, Py_ssize_t length,
4332 Py_ssize_t startpos, Py_ssize_t endpos,
4333 const char *reason)
4334{
4335 if (*exceptionObject == NULL) {
4336 *exceptionObject = PyUnicodeDecodeError_Create(
4337 encoding, input, length, startpos, endpos, reason);
4338 }
4339 else {
4340 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4341 goto onError;
4342 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4343 goto onError;
4344 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4345 goto onError;
4346 }
4347 return;
4348
4349onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004350 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004351}
4352
Steve Dowercc16be82016-09-08 10:35:16 -07004353#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004354static int
4355widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4356{
4357 if (newsize > *size) {
4358 wchar_t *newbuf = *buf;
4359 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4360 PyErr_NoMemory();
4361 return -1;
4362 }
4363 *buf = newbuf;
4364 }
4365 *size = newsize;
4366 return 0;
4367}
4368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369/* error handling callback helper:
4370 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004371 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372 and adjust various state variables.
4373 return 0 on success, -1 on error
4374*/
4375
Alexander Belopolsky40018472011-02-26 01:02:56 +00004376static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004377unicode_decode_call_errorhandler_wchar(
4378 const char *errors, PyObject **errorHandler,
4379 const char *encoding, const char *reason,
4380 const char **input, const char **inend, Py_ssize_t *startinpos,
4381 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004382 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004384 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385
4386 PyObject *restuple = NULL;
4387 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004388 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004389 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004390 Py_ssize_t requiredsize;
4391 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004392 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004393 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394
4395 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 *errorHandler = PyCodec_LookupError(errors);
4397 if (*errorHandler == NULL)
4398 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 }
4400
Victor Stinner554f3f02010-06-16 23:33:54 +00004401 make_decode_exception(exceptionObject,
4402 encoding,
4403 *input, *inend - *input,
4404 *startinpos, *endinpos,
4405 reason);
4406 if (*exceptionObject == NULL)
4407 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408
Petr Viktorinffd97532020-02-11 17:46:57 +01004409 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004413 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004416 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004418
4419 /* Copy back the bytes variables, which might have been modified by the
4420 callback */
4421 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4422 if (!inputobj)
4423 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 *input = PyBytes_AS_STRING(inputobj);
4425 insize = PyBytes_GET_SIZE(inputobj);
4426 *inend = *input + insize;
4427 /* we can DECREF safely, as the exception has another reference,
4428 so the object won't go away. */
4429 Py_DECREF(inputobj);
4430
4431 if (newpos<0)
4432 newpos = insize+newpos;
4433 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004434 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004435 goto onError;
4436 }
4437
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004438#if USE_UNICODE_WCHAR_CACHE
4439_Py_COMP_DIAG_PUSH
4440_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4441 repwlen = PyUnicode_GetSize(repunicode);
4442 if (repwlen < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443 goto onError;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004444_Py_COMP_DIAG_POP
4445#else /* USE_UNICODE_WCHAR_CACHE */
4446 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4447 if (repwlen < 0)
4448 goto onError;
4449 repwlen--;
4450#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004451 /* need more space? (at least enough for what we
4452 have+the replacement+the rest of the string (starting
4453 at the new input position), so we won't have to check space
4454 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004455 requiredsize = *outpos;
4456 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4457 goto overflow;
4458 requiredsize += repwlen;
4459 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4460 goto overflow;
4461 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004462 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004463 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004464 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004465 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004466 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004467 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004468 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004469 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004470 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004471 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004472 *endinpos = newpos;
4473 *inptr = *input + newpos;
4474
4475 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004476 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477 return 0;
4478
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004479 overflow:
4480 PyErr_SetString(PyExc_OverflowError,
4481 "decoded result is too long for a Python string");
4482
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483 onError:
4484 Py_XDECREF(restuple);
4485 return -1;
4486}
Steve Dowercc16be82016-09-08 10:35:16 -07004487#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004488
4489static int
4490unicode_decode_call_errorhandler_writer(
4491 const char *errors, PyObject **errorHandler,
4492 const char *encoding, const char *reason,
4493 const char **input, const char **inend, Py_ssize_t *startinpos,
4494 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4495 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4496{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004497 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004498
4499 PyObject *restuple = NULL;
4500 PyObject *repunicode = NULL;
4501 Py_ssize_t insize;
4502 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004503 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004504 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004505 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004506 int need_to_grow = 0;
4507 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004508
4509 if (*errorHandler == NULL) {
4510 *errorHandler = PyCodec_LookupError(errors);
4511 if (*errorHandler == NULL)
4512 goto onError;
4513 }
4514
4515 make_decode_exception(exceptionObject,
4516 encoding,
4517 *input, *inend - *input,
4518 *startinpos, *endinpos,
4519 reason);
4520 if (*exceptionObject == NULL)
4521 goto onError;
4522
Petr Viktorinffd97532020-02-11 17:46:57 +01004523 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004524 if (restuple == NULL)
4525 goto onError;
4526 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004527 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004528 goto onError;
4529 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004530 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004531 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004532
4533 /* Copy back the bytes variables, which might have been modified by the
4534 callback */
4535 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4536 if (!inputobj)
4537 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004538 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004539 *input = PyBytes_AS_STRING(inputobj);
4540 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004541 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004542 /* we can DECREF safely, as the exception has another reference,
4543 so the object won't go away. */
4544 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004548 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004549 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004551 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552
Victor Stinner170ca6f2013-04-18 00:25:28 +02004553 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004554 if (replen > 1) {
4555 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004556 need_to_grow = 1;
4557 }
4558 new_inptr = *input + newpos;
4559 if (*inend - new_inptr > remain) {
4560 /* We don't know the decoding algorithm here so we make the worst
4561 assumption that one byte decodes to one unicode character.
4562 If unfortunately one byte could decode to more unicode characters,
4563 the decoder may write out-of-bound then. Is it possible for the
4564 algorithms using this function? */
4565 writer->min_length += *inend - new_inptr - remain;
4566 need_to_grow = 1;
4567 }
4568 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004569 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004570 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004571 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4572 goto onError;
4573 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004574 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004575 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004578 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004579
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004581 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004582 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004586 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587}
4588
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
 *
 * Note: all the macros in this section evaluate their character argument
 * more than once; do not pass expressions with side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that a
 * compound argument (e.g. `a || b`) is treated as a single flag instead
 * of being split up by the surrounding && operators. */

#define ENCODE_DIRECT(c, directO, directWS)               \
    ((c) < 128 && (c) > 0 &&                              \
     ((utf7_category[(c)] == 0) ||                        \
      ((directWS) && (utf7_category[(c)] == 2)) ||        \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669
Alexander Belopolsky40018472011-02-26 01:02:56 +00004670PyObject *
4671PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004672 Py_ssize_t size,
4673 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004674{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004675 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4676}
4677
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678/* The decoder. The only state we preserve is our read position,
4679 * i.e. how many characters we have consumed. So if we end in the
4680 * middle of a shift sequence we have to back off the read position
4681 * and the output to the beginning of the sequence, otherwise we lose
4682 * all the shift state (seen bits, number of bits seen, high
4683 * surrogate). */
4684
Alexander Belopolsky40018472011-02-26 01:02:56 +00004685PyObject *
4686PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004687 Py_ssize_t size,
4688 const char *errors,
4689 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004690{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004692 Py_ssize_t startinpos;
4693 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004695 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004696 const char *errmsg = "";
4697 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004698 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 unsigned int base64bits = 0;
4700 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004701 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004702 PyObject *errorHandler = NULL;
4703 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004704
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004705 if (size == 0) {
4706 if (consumed)
4707 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004708 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004709 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004710
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004711 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004712 _PyUnicodeWriter_Init(&writer);
4713 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004714
4715 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004716 e = s + size;
4717
4718 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004719 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004721 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004722
Antoine Pitrou244651a2009-05-04 18:56:13 +00004723 if (inShift) { /* in a base-64 section */
4724 if (IS_BASE64(ch)) { /* consume a base-64 character */
4725 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4726 base64bits += 6;
4727 s++;
4728 if (base64bits >= 16) {
4729 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004730 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004731 base64bits -= 16;
4732 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004733 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004734 if (surrogate) {
4735 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004736 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4737 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004738 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004739 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004740 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004741 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004742 }
4743 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004744 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004745 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004746 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004747 }
4748 }
Victor Stinner551ac952011-11-29 22:58:13 +01004749 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004750 /* first surrogate */
4751 surrogate = outCh;
4752 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004753 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004754 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004755 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004756 }
4757 }
4758 }
4759 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004760 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004761 if (base64bits > 0) { /* left-over bits */
4762 if (base64bits >= 6) {
4763 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004764 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 errmsg = "partial character in shift sequence";
4766 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004767 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 else {
4769 /* Some bits remain; they should be zero */
4770 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004771 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 errmsg = "non-zero padding bits in shift sequence";
4773 goto utf7Error;
4774 }
4775 }
4776 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004777 if (surrogate && DECODE_DIRECT(ch)) {
4778 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4779 goto onError;
4780 }
4781 surrogate = 0;
4782 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004783 /* '-' is absorbed; other terminating
4784 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004785 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004786 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004787 }
4788 }
4789 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 s++; /* consume '+' */
4792 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004793 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004794 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004795 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004796 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004797 else if (s < e && !IS_BASE64(*s)) {
4798 s++;
4799 errmsg = "ill-formed sequence";
4800 goto utf7Error;
4801 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004802 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004803 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004804 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004805 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004806 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004807 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004808 }
4809 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004810 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004811 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004812 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004813 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004814 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 else {
4816 startinpos = s-starts;
4817 s++;
4818 errmsg = "unexpected special character";
4819 goto utf7Error;
4820 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004821 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004824 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 errors, &errorHandler,
4826 "utf7", errmsg,
4827 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004828 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004829 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004830 }
4831
Antoine Pitrou244651a2009-05-04 18:56:13 +00004832 /* end of string */
4833
4834 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4835 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004836 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004837 if (surrogate ||
4838 (base64bits >= 6) ||
4839 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004840 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004841 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004842 errors, &errorHandler,
4843 "utf7", "unterminated shift sequence",
4844 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004845 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004846 goto onError;
4847 if (s < e)
4848 goto restart;
4849 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004850 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004851
4852 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004853 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004854 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004855 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004856 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004857 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004858 writer.kind, writer.data, shiftOutStart);
4859 Py_XDECREF(errorHandler);
4860 Py_XDECREF(exc);
4861 _PyUnicodeWriter_Dealloc(&writer);
4862 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004863 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004864 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004865 }
4866 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004867 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004868 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004869 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 Py_XDECREF(errorHandler);
4872 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004873 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004874
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 Py_XDECREF(errorHandler);
4877 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004878 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004879 return NULL;
4880}
4881
4882
Alexander Belopolsky40018472011-02-26 01:02:56 +00004883PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004884_PyUnicode_EncodeUTF7(PyObject *str,
4885 int base64SetO,
4886 int base64WhiteSpace,
4887 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004888{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004889 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004890 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004891 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004892 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004893 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004894 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004895 unsigned int base64bits = 0;
4896 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004897 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004898 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004899
Benjamin Petersonbac79492012-01-14 13:34:47 -05004900 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004901 return NULL;
4902 kind = PyUnicode_KIND(str);
4903 data = PyUnicode_DATA(str);
4904 len = PyUnicode_GET_LENGTH(str);
4905
4906 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004908
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004909 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004910 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004911 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004912 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004913 if (v == NULL)
4914 return NULL;
4915
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004916 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004917 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004918 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004919
Antoine Pitrou244651a2009-05-04 18:56:13 +00004920 if (inShift) {
4921 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4922 /* shifting out */
4923 if (base64bits) { /* output remaining bits */
4924 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4925 base64buffer = 0;
4926 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004927 }
4928 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004929 /* Characters not in the BASE64 set implicitly unshift the sequence
4930 so no '-' is required, except if the character is itself a '-' */
4931 if (IS_BASE64(ch) || ch == '-') {
4932 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004933 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004934 *out++ = (char) ch;
4935 }
4936 else {
4937 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004938 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004939 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004940 else { /* not in a shift sequence */
4941 if (ch == '+') {
4942 *out++ = '+';
4943 *out++ = '-';
4944 }
4945 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4946 *out++ = (char) ch;
4947 }
4948 else {
4949 *out++ = '+';
4950 inShift = 1;
4951 goto encode_char;
4952 }
4953 }
4954 continue;
4955encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004956 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004957 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004958
Antoine Pitrou244651a2009-05-04 18:56:13 +00004959 /* code first surrogate */
4960 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004961 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004962 while (base64bits >= 6) {
4963 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4964 base64bits -= 6;
4965 }
4966 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004967 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004968 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004969 base64bits += 16;
4970 base64buffer = (base64buffer << 16) | ch;
4971 while (base64bits >= 6) {
4972 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4973 base64bits -= 6;
4974 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004975 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004976 if (base64bits)
4977 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4978 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004979 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004980 if (_PyBytes_Resize(&v, out - start) < 0)
4981 return NULL;
4982 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004983}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004984PyObject *
4985PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4986 Py_ssize_t size,
4987 int base64SetO,
4988 int base64WhiteSpace,
4989 const char *errors)
4990{
4991 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004992 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004993 if (tmp == NULL)
4994 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004995 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004996 base64WhiteSpace, errors);
4997 Py_DECREF(tmp);
4998 return result;
4999}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005000
Antoine Pitrou244651a2009-05-04 18:56:13 +00005001#undef IS_BASE64
5002#undef FROM_BASE64
5003#undef TO_BASE64
5004#undef DECODE_DIRECT
5005#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005006
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007/* --- UTF-8 Codec -------------------------------------------------------- */
5008
Alexander Belopolsky40018472011-02-26 01:02:56 +00005009PyObject *
5010PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005011 Py_ssize_t size,
5012 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013{
Walter Dörwald69652032004-09-07 20:24:22 +00005014 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5015}
5016
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005017#include "stringlib/asciilib.h"
5018#include "stringlib/codecs.h"
5019#include "stringlib/undef.h"
5020
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005021#include "stringlib/ucs1lib.h"
5022#include "stringlib/codecs.h"
5023#include "stringlib/undef.h"
5024
5025#include "stringlib/ucs2lib.h"
5026#include "stringlib/codecs.h"
5027#include "stringlib/undef.h"
5028
5029#include "stringlib/ucs4lib.h"
5030#include "stringlib/codecs.h"
5031#include "stringlib/undef.h"
5032
Ma Lina0c603c2020-10-18 22:48:38 +08005033/* Mask to quickly check whether a C 'size_t' contains a
Antoine Pitrouab868312009-01-10 15:40:25 +00005034 non-ASCII, UTF8-encoded char. */
Ma Lina0c603c2020-10-18 22:48:38 +08005035#if (SIZEOF_SIZE_T == 8)
5036# define ASCII_CHAR_MASK 0x8080808080808080ULL
5037#elif (SIZEOF_SIZE_T == 4)
5038# define ASCII_CHAR_MASK 0x80808080U
Antoine Pitrouab868312009-01-10 15:40:25 +00005039#else
Ma Lina0c603c2020-10-18 22:48:38 +08005040# error C 'size_t' size should be either 4 or 8!
Antoine Pitrouab868312009-01-10 15:40:25 +00005041#endif
5042
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043static Py_ssize_t
5044ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005045{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005046 const char *p = start;
Ma Lina0c603c2020-10-18 22:48:38 +08005047 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_SIZE_T);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005048
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005049 /*
5050 * Issue #17237: m68k is a bit different from most architectures in
5051 * that objects do not use "natural alignment" - for example, int and
5052 * long are only aligned at 2-byte boundaries. Therefore the assert()
5053 * won't work; also, tests have shown that skipping the "optimised
5054 * version" will even speed up m68k.
5055 */
5056#if !defined(__m68k__)
Ma Lina0c603c2020-10-18 22:48:38 +08005057#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5058 assert(_Py_IS_ALIGNED(dest, SIZEOF_SIZE_T));
5059 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060 /* Fast path, see in STRINGLIB(utf8_decode) for
5061 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005062 /* Help allocation */
5063 const char *_p = p;
5064 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005065 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005066 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005067 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005069 *((size_t *)q) = value;
5070 _p += SIZEOF_SIZE_T;
5071 q += SIZEOF_SIZE_T;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005072 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005073 p = _p;
5074 while (p < end) {
5075 if ((unsigned char)*p & 0x80)
5076 break;
5077 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005082#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 while (p < end) {
5084 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5085 for an explanation. */
Ma Lina0c603c2020-10-18 22:48:38 +08005086 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005087 /* Help allocation */
5088 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005090 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005091 if (value & ASCII_CHAR_MASK)
5092 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005093 _p += SIZEOF_SIZE_T;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005094 }
5095 p = _p;
5096 if (_p == end)
5097 break;
5098 }
5099 if ((unsigned char)*p & 0x80)
5100 break;
5101 ++p;
5102 }
5103 memcpy(dest, start, p - start);
5104 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105}
Antoine Pitrouab868312009-01-10 15:40:25 +00005106
Victor Stinner709d23d2019-05-02 14:56:30 -04005107static PyObject *
5108unicode_decode_utf8(const char *s, Py_ssize_t size,
5109 _Py_error_handler error_handler, const char *errors,
5110 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005111{
Victor Stinner785938e2011-12-11 20:09:03 +01005112 if (size == 0) {
5113 if (consumed)
5114 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005115 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005116 }
5117
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005118 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5119 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005120 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005121 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005122 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005123 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005124 }
5125
Inada Naoki770847a2019-06-24 12:30:24 +09005126 const char *starts = s;
5127 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005128
Inada Naoki770847a2019-06-24 12:30:24 +09005129 // fast path: try ASCII string.
5130 PyObject *u = PyUnicode_New(size, 127);
5131 if (u == NULL) {
5132 return NULL;
5133 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005134 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005135 if (s == end) {
5136 return u;
5137 }
5138
5139 // Use _PyUnicodeWriter after fast path is failed.
5140 _PyUnicodeWriter writer;
5141 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5142 writer.pos = s - starts;
5143
5144 Py_ssize_t startinpos, endinpos;
5145 const char *errmsg = "";
5146 PyObject *error_handler_obj = NULL;
5147 PyObject *exc = NULL;
5148
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 while (s < end) {
5150 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005151 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005152
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005154 if (PyUnicode_IS_ASCII(writer.buffer))
5155 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005156 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005157 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005158 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005159 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005160 } else {
5161 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005162 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005163 }
5164
5165 switch (ch) {
5166 case 0:
5167 if (s == end || consumed)
5168 goto End;
5169 errmsg = "unexpected end of data";
5170 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005171 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005172 break;
5173 case 1:
5174 errmsg = "invalid start byte";
5175 startinpos = s - starts;
5176 endinpos = startinpos + 1;
5177 break;
5178 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005179 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5180 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5181 {
5182 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005183 goto End;
5184 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005185 /* fall through */
5186 case 3:
5187 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005188 errmsg = "invalid continuation byte";
5189 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005190 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005191 break;
5192 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005193 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005194 goto onError;
5195 continue;
5196 }
5197
Victor Stinner1d65d912015-10-05 13:43:50 +02005198 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005199 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005200
5201 switch (error_handler) {
5202 case _Py_ERROR_IGNORE:
5203 s += (endinpos - startinpos);
5204 break;
5205
5206 case _Py_ERROR_REPLACE:
5207 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5208 goto onError;
5209 s += (endinpos - startinpos);
5210 break;
5211
5212 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005213 {
5214 Py_ssize_t i;
5215
Victor Stinner1d65d912015-10-05 13:43:50 +02005216 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5217 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005218 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005219 ch = (Py_UCS4)(unsigned char)(starts[i]);
5220 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5221 ch + 0xdc00);
5222 writer.pos++;
5223 }
5224 s += (endinpos - startinpos);
5225 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005226 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005227
5228 default:
5229 if (unicode_decode_call_errorhandler_writer(
5230 errors, &error_handler_obj,
5231 "utf-8", errmsg,
5232 &starts, &end, &startinpos, &endinpos, &exc, &s,
5233 &writer))
5234 goto onError;
5235 }
Victor Stinner785938e2011-12-11 20:09:03 +01005236 }
5237
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005238End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005239 if (consumed)
5240 *consumed = s - starts;
5241
Victor Stinner1d65d912015-10-05 13:43:50 +02005242 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005243 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005245
5246onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005247 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005248 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005249 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005250 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005251}
5252
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005253
Victor Stinner709d23d2019-05-02 14:56:30 -04005254PyObject *
5255PyUnicode_DecodeUTF8Stateful(const char *s,
5256 Py_ssize_t size,
5257 const char *errors,
5258 Py_ssize_t *consumed)
5259{
5260 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5261}
5262
5263
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005264/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5265 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005266
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005267 On success, write a pointer to a newly allocated wide character string into
5268 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5269 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005270
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005271 On memory allocation failure, return -1.
5272
5273 On decoding error (if surrogateescape is zero), return -2. If wlen is
5274 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5275 is not NULL, write the decoding error message into *reason. */
5276int
5277_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005278 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005279{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005280 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005281 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005282 wchar_t *unicode;
5283 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005284
Victor Stinner3d4226a2018-08-29 22:21:32 +02005285 int surrogateescape = 0;
5286 int surrogatepass = 0;
5287 switch (errors)
5288 {
5289 case _Py_ERROR_STRICT:
5290 break;
5291 case _Py_ERROR_SURROGATEESCAPE:
5292 surrogateescape = 1;
5293 break;
5294 case _Py_ERROR_SURROGATEPASS:
5295 surrogatepass = 1;
5296 break;
5297 default:
5298 return -3;
5299 }
5300
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005301 /* Note: size will always be longer than the resulting Unicode
5302 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005303 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005304 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005305 }
5306
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005307 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005308 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005309 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005310 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005311
5312 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005313 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005314 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005315 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005316 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005317#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005318 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005319#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005320 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005321#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005322 if (ch > 0xFF) {
5323#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005324 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005325#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005326 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005327 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005328 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5329 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5330#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005331 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005332 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005333 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005334 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005335 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005336
5337 if (surrogateescape) {
5338 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5339 }
5340 else {
5341 /* Is it a valid three-byte code? */
5342 if (surrogatepass
5343 && (e - s) >= 3
5344 && (s[0] & 0xf0) == 0xe0
5345 && (s[1] & 0xc0) == 0x80
5346 && (s[2] & 0xc0) == 0x80)
5347 {
5348 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5349 s += 3;
5350 unicode[outpos++] = ch;
5351 }
5352 else {
5353 PyMem_RawFree(unicode );
5354 if (reason != NULL) {
5355 switch (ch) {
5356 case 0:
5357 *reason = "unexpected end of data";
5358 break;
5359 case 1:
5360 *reason = "invalid start byte";
5361 break;
5362 /* 2, 3, 4 */
5363 default:
5364 *reason = "invalid continuation byte";
5365 break;
5366 }
5367 }
5368 if (wlen != NULL) {
5369 *wlen = s - orig_s;
5370 }
5371 return -2;
5372 }
5373 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005374 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005375 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005376 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005377 if (wlen) {
5378 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005379 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005380 *wstr = unicode;
5381 return 0;
5382}
5383
Victor Stinner5f9cf232019-03-19 01:46:25 +01005384
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005385wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005386_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5387 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005388{
5389 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005390 int res = _Py_DecodeUTF8Ex(arg, arglen,
5391 &wstr, wlen,
5392 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005393 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005394 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5395 assert(res != -3);
5396 if (wlen) {
5397 *wlen = (size_t)res;
5398 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005399 return NULL;
5400 }
5401 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005402}
5403
Antoine Pitrouab868312009-01-10 15:40:25 +00005404
/* UTF-8 encoder supporting the strict, surrogateescape and surrogatepass
   error handlers (selected by 'errors').

   On success, return 0 and write the newly allocated character string into
   *str (free the memory with PyMem_RawFree() if raw_malloc is non-zero, with
   PyMem_Free() otherwise).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1.

   If 'errors' is not one of the supported error handlers, return -3. */
5415int
5416_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005417 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005418{
5419 const Py_ssize_t max_char_size = 4;
5420 Py_ssize_t len = wcslen(text);
5421
5422 assert(len >= 0);
5423
Victor Stinner3d4226a2018-08-29 22:21:32 +02005424 int surrogateescape = 0;
5425 int surrogatepass = 0;
5426 switch (errors)
5427 {
5428 case _Py_ERROR_STRICT:
5429 break;
5430 case _Py_ERROR_SURROGATEESCAPE:
5431 surrogateescape = 1;
5432 break;
5433 case _Py_ERROR_SURROGATEPASS:
5434 surrogatepass = 1;
5435 break;
5436 default:
5437 return -3;
5438 }
5439
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005440 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5441 return -1;
5442 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005443 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005444 if (raw_malloc) {
5445 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005446 }
5447 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005448 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005449 }
5450 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005451 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005452 }
5453
5454 char *p = bytes;
5455 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005456 for (i = 0; i < len; ) {
5457 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005458 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005459 i++;
5460#if Py_UNICODE_SIZE == 2
5461 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5462 && i < len
5463 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5464 {
5465 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5466 i++;
5467 }
5468#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005469
5470 if (ch < 0x80) {
5471 /* Encode ASCII */
5472 *p++ = (char) ch;
5473
5474 }
5475 else if (ch < 0x0800) {
5476 /* Encode Latin-1 */
5477 *p++ = (char)(0xc0 | (ch >> 6));
5478 *p++ = (char)(0x80 | (ch & 0x3f));
5479 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005480 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005481 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005482 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005483 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005484 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005485 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005486 if (reason != NULL) {
5487 *reason = "encoding error";
5488 }
5489 if (raw_malloc) {
5490 PyMem_RawFree(bytes);
5491 }
5492 else {
5493 PyMem_Free(bytes);
5494 }
5495 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005496 }
5497 *p++ = (char)(ch & 0xff);
5498 }
5499 else if (ch < 0x10000) {
5500 *p++ = (char)(0xe0 | (ch >> 12));
5501 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5502 *p++ = (char)(0x80 | (ch & 0x3f));
5503 }
5504 else { /* ch >= 0x10000 */
5505 assert(ch <= MAX_UNICODE);
5506 /* Encode UCS4 Unicode ordinals */
5507 *p++ = (char)(0xf0 | (ch >> 18));
5508 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5509 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5510 *p++ = (char)(0x80 | (ch & 0x3f));
5511 }
5512 }
5513 *p++ = '\0';
5514
5515 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005516 char *bytes2;
5517 if (raw_malloc) {
5518 bytes2 = PyMem_RawRealloc(bytes, final_size);
5519 }
5520 else {
5521 bytes2 = PyMem_Realloc(bytes, final_size);
5522 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005523 if (bytes2 == NULL) {
5524 if (error_pos != NULL) {
5525 *error_pos = (size_t)-1;
5526 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005527 if (raw_malloc) {
5528 PyMem_RawFree(bytes);
5529 }
5530 else {
5531 PyMem_Free(bytes);
5532 }
5533 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005534 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005535 *str = bytes2;
5536 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005537}
5538
5539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540/* Primary internal function which creates utf8 encoded bytes objects.
5541
5542 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005543 and allocate exactly as much space needed at the end. Else allocate the
5544 maximum possible needed (4 result bytes per Unicode character), and return
5545 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005546*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005547static PyObject *
5548unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5549 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551 if (!PyUnicode_Check(unicode)) {
5552 PyErr_BadArgument();
5553 return NULL;
5554 }
5555
5556 if (PyUnicode_READY(unicode) == -1)
5557 return NULL;
5558
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005559 if (PyUnicode_UTF8(unicode))
5560 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5561 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562
Inada Naoki02a4d572020-02-27 13:48:59 +09005563 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005564 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005565 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5566
5567 _PyBytesWriter writer;
5568 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005569
Benjamin Petersonead6b532011-12-20 17:23:42 -06005570 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005571 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005572 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005573 case PyUnicode_1BYTE_KIND:
5574 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5575 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005576 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5577 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005578 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005579 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5580 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005581 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005582 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5583 break;
Tim Peters602f7402002-04-27 18:03:26 +00005584 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005585
5586 if (end == NULL) {
5587 _PyBytesWriter_Dealloc(&writer);
5588 return NULL;
5589 }
5590 return _PyBytesWriter_Finish(&writer, end);
5591}
5592
5593static int
5594unicode_fill_utf8(PyObject *unicode)
5595{
5596 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5597 assert(!PyUnicode_IS_ASCII(unicode));
5598
5599 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005600 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005601 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5602
5603 _PyBytesWriter writer;
5604 char *end;
5605
5606 switch (kind) {
5607 default:
5608 Py_UNREACHABLE();
5609 case PyUnicode_1BYTE_KIND:
5610 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5611 _Py_ERROR_STRICT, NULL);
5612 break;
5613 case PyUnicode_2BYTE_KIND:
5614 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5615 _Py_ERROR_STRICT, NULL);
5616 break;
5617 case PyUnicode_4BYTE_KIND:
5618 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5619 _Py_ERROR_STRICT, NULL);
5620 break;
5621 }
5622 if (end == NULL) {
5623 _PyBytesWriter_Dealloc(&writer);
5624 return -1;
5625 }
5626
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005627 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005628 PyBytes_AS_STRING(writer.buffer);
5629 Py_ssize_t len = end - start;
5630
Victor Stinner32bd68c2020-12-01 10:37:39 +01005631 char *cache = PyObject_Malloc(len + 1);
Inada Naoki02a4d572020-02-27 13:48:59 +09005632 if (cache == NULL) {
5633 _PyBytesWriter_Dealloc(&writer);
5634 PyErr_NoMemory();
5635 return -1;
5636 }
5637 _PyUnicode_UTF8(unicode) = cache;
5638 _PyUnicode_UTF8_LENGTH(unicode) = len;
5639 memcpy(cache, start, len);
5640 cache[len] = '\0';
5641 _PyBytesWriter_Dealloc(&writer);
5642 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643}
5644
Alexander Belopolsky40018472011-02-26 01:02:56 +00005645PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005646_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5647{
5648 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5649}
5650
5651
5652PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005653PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5654 Py_ssize_t size,
5655 const char *errors)
5656{
5657 PyObject *v, *unicode;
5658
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005659 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005660 if (unicode == NULL)
5661 return NULL;
5662 v = _PyUnicode_AsUTF8String(unicode, errors);
5663 Py_DECREF(unicode);
5664 return v;
5665}
5666
5667PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005668PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671}
5672
Walter Dörwald41980ca2007-08-16 21:55:45 +00005673/* --- UTF-32 Codec ------------------------------------------------------- */
5674
5675PyObject *
5676PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 Py_ssize_t size,
5678 const char *errors,
5679 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005680{
5681 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5682}
5683
5684PyObject *
5685PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 Py_ssize_t size,
5687 const char *errors,
5688 int *byteorder,
5689 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005690{
5691 const char *starts = s;
5692 Py_ssize_t startinpos;
5693 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005694 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005695 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005696 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005697 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005698 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005699 PyObject *errorHandler = NULL;
5700 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005701
Andy Lestere6be9b52020-02-11 20:28:35 -06005702 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005703 e = q + size;
5704
5705 if (byteorder)
5706 bo = *byteorder;
5707
5708 /* Check for BOM marks (U+FEFF) in the input and adjust current
5709 byte order setting accordingly. In native mode, the leading BOM
5710 mark is skipped, in all other modes, it is copied to the output
5711 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005712 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005713 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005714 if (bom == 0x0000FEFF) {
5715 bo = -1;
5716 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005718 else if (bom == 0xFFFE0000) {
5719 bo = 1;
5720 q += 4;
5721 }
5722 if (byteorder)
5723 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005724 }
5725
Victor Stinnere64322e2012-10-30 23:12:47 +01005726 if (q == e) {
5727 if (consumed)
5728 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005729 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005730 }
5731
Victor Stinnere64322e2012-10-30 23:12:47 +01005732#ifdef WORDS_BIGENDIAN
5733 le = bo < 0;
5734#else
5735 le = bo <= 0;
5736#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005737 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005738
Victor Stinner8f674cc2013-04-17 23:02:17 +02005739 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005740 writer.min_length = (e - q + 3) / 4;
5741 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005742 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005743
Victor Stinnere64322e2012-10-30 23:12:47 +01005744 while (1) {
5745 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005746 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005747
Victor Stinnere64322e2012-10-30 23:12:47 +01005748 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005749 enum PyUnicode_Kind kind = writer.kind;
5750 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005751 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005752 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005753 if (le) {
5754 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005755 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005756 if (ch > maxch)
5757 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 if (kind != PyUnicode_1BYTE_KIND &&
5759 Py_UNICODE_IS_SURROGATE(ch))
5760 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005761 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005762 q += 4;
5763 } while (q <= last);
5764 }
5765 else {
5766 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005767 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005768 if (ch > maxch)
5769 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005770 if (kind != PyUnicode_1BYTE_KIND &&
5771 Py_UNICODE_IS_SURROGATE(ch))
5772 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005773 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005774 q += 4;
5775 } while (q <= last);
5776 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005778 }
5779
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005780 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005781 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005782 startinpos = ((const char *)q) - starts;
5783 endinpos = startinpos + 4;
5784 }
5785 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005786 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005788 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005790 startinpos = ((const char *)q) - starts;
5791 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005793 else {
5794 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005795 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005796 goto onError;
5797 q += 4;
5798 continue;
5799 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005800 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005801 startinpos = ((const char *)q) - starts;
5802 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005804
5805 /* The remaining input chars are ignored if the callback
5806 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005807 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005809 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005811 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005813 }
5814
Walter Dörwald41980ca2007-08-16 21:55:45 +00005815 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005816 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005817
Walter Dörwald41980ca2007-08-16 21:55:45 +00005818 Py_XDECREF(errorHandler);
5819 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005820 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005821
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005823 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005824 Py_XDECREF(errorHandler);
5825 Py_XDECREF(exc);
5826 return NULL;
5827}
5828
5829PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005830_PyUnicode_EncodeUTF32(PyObject *str,
5831 const char *errors,
5832 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005833{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005834 enum PyUnicode_Kind kind;
5835 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005836 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005837 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005838 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005839#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005840 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005841#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005842 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005843#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005845 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005846 PyObject *errorHandler = NULL;
5847 PyObject *exc = NULL;
5848 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005849
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005850 if (!PyUnicode_Check(str)) {
5851 PyErr_BadArgument();
5852 return NULL;
5853 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005854 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005855 return NULL;
5856 kind = PyUnicode_KIND(str);
5857 data = PyUnicode_DATA(str);
5858 len = PyUnicode_GET_LENGTH(str);
5859
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005860 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005861 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005862 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005863 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005864 if (v == NULL)
5865 return NULL;
5866
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005867 /* output buffer is 4-bytes aligned */
5868 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005869 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005870 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005871 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005872 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005873 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005874
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005875 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005876 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005877 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005878 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005879 else
5880 encoding = "utf-32";
5881
5882 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005883 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5884 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005885 }
5886
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005887 pos = 0;
5888 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005889 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005890
5891 if (kind == PyUnicode_2BYTE_KIND) {
5892 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5893 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005894 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005895 else {
5896 assert(kind == PyUnicode_4BYTE_KIND);
5897 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5898 &out, native_ordering);
5899 }
5900 if (pos == len)
5901 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005902
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005903 rep = unicode_encode_call_errorhandler(
5904 errors, &errorHandler,
5905 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005906 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005907 if (!rep)
5908 goto error;
5909
5910 if (PyBytes_Check(rep)) {
5911 repsize = PyBytes_GET_SIZE(rep);
5912 if (repsize & 3) {
5913 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005914 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005915 "surrogates not allowed");
5916 goto error;
5917 }
5918 moreunits = repsize / 4;
5919 }
5920 else {
5921 assert(PyUnicode_Check(rep));
5922 if (PyUnicode_READY(rep) < 0)
5923 goto error;
5924 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5925 if (!PyUnicode_IS_ASCII(rep)) {
5926 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005927 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005928 "surrogates not allowed");
5929 goto error;
5930 }
5931 }
5932
5933 /* four bytes are reserved for each surrogate */
5934 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005935 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005936 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005937 /* integer overflow */
5938 PyErr_NoMemory();
5939 goto error;
5940 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005941 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005942 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005943 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005944 }
5945
5946 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005947 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005948 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005949 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005950 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005951 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5952 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005953 }
5954
5955 Py_CLEAR(rep);
5956 }
5957
5958 /* Cut back to size actually needed. This is necessary for, for example,
5959 encoding of a string containing isolated surrogates and the 'ignore'
5960 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005961 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005962 if (nsize != PyBytes_GET_SIZE(v))
5963 _PyBytes_Resize(&v, nsize);
5964 Py_XDECREF(errorHandler);
5965 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005966 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005967 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005968 error:
5969 Py_XDECREF(rep);
5970 Py_XDECREF(errorHandler);
5971 Py_XDECREF(exc);
5972 Py_XDECREF(v);
5973 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005974}
5975
Alexander Belopolsky40018472011-02-26 01:02:56 +00005976PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005977PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5978 Py_ssize_t size,
5979 const char *errors,
5980 int byteorder)
5981{
5982 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005983 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005984 if (tmp == NULL)
5985 return NULL;
5986 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5987 Py_DECREF(tmp);
5988 return result;
5989}
5990
5991PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005992PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005993{
Victor Stinnerb960b342011-11-20 19:12:52 +01005994 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005995}
5996
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997/* --- UTF-16 Codec ------------------------------------------------------- */
5998
Tim Peters772747b2001-08-09 22:21:55 +00005999PyObject *
6000PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 Py_ssize_t size,
6002 const char *errors,
6003 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004{
Walter Dörwald69652032004-09-07 20:24:22 +00006005 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6006}
6007
6008PyObject *
6009PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 Py_ssize_t size,
6011 const char *errors,
6012 int *byteorder,
6013 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00006014{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006016 Py_ssize_t startinpos;
6017 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006018 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006019 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006020 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006021 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006022 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 PyObject *errorHandler = NULL;
6024 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006025 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026
Andy Lestere6be9b52020-02-11 20:28:35 -06006027 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006028 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
6030 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006031 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006033 /* Check for BOM marks (U+FEFF) in the input and adjust current
6034 byte order setting accordingly. In native mode, the leading BOM
6035 mark is skipped, in all other modes, it is copied to the output
6036 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006037 if (bo == 0 && size >= 2) {
6038 const Py_UCS4 bom = (q[1] << 8) | q[0];
6039 if (bom == 0xFEFF) {
6040 q += 2;
6041 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006043 else if (bom == 0xFFFE) {
6044 q += 2;
6045 bo = 1;
6046 }
6047 if (byteorder)
6048 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
Antoine Pitrou63065d72012-05-15 23:48:04 +02006051 if (q == e) {
6052 if (consumed)
6053 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006054 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006055 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006056
Christian Heimes743e0cd2012-10-17 23:52:17 +02006057#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006058 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006059 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006060#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006061 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006062 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006063#endif
Tim Peters772747b2001-08-09 22:21:55 +00006064
Antoine Pitrou63065d72012-05-15 23:48:04 +02006065 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006066 character count normally. Error handler will take care of
6067 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006068 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006069 writer.min_length = (e - q + 1) / 2;
6070 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006071 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006072
Antoine Pitrou63065d72012-05-15 23:48:04 +02006073 while (1) {
6074 Py_UCS4 ch = 0;
6075 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006076 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006077 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006078 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006079 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006080 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006081 native_ordering);
6082 else
6083 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006084 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006085 native_ordering);
6086 } else if (kind == PyUnicode_2BYTE_KIND) {
6087 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006088 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006089 native_ordering);
6090 } else {
6091 assert(kind == PyUnicode_4BYTE_KIND);
6092 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006093 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006094 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006095 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006096 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006097
Antoine Pitrou63065d72012-05-15 23:48:04 +02006098 switch (ch)
6099 {
6100 case 0:
6101 /* remaining byte at the end? (size should be even) */
6102 if (q == e || consumed)
6103 goto End;
6104 errmsg = "truncated data";
6105 startinpos = ((const char *)q) - starts;
6106 endinpos = ((const char *)e) - starts;
6107 break;
6108 /* The remaining input chars are ignored if the callback
6109 chooses to skip the input */
6110 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006111 q -= 2;
6112 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006113 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006114 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006115 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006116 endinpos = ((const char *)e) - starts;
6117 break;
6118 case 2:
6119 errmsg = "illegal encoding";
6120 startinpos = ((const char *)q) - 2 - starts;
6121 endinpos = startinpos + 2;
6122 break;
6123 case 3:
6124 errmsg = "illegal UTF-16 surrogate";
6125 startinpos = ((const char *)q) - 4 - starts;
6126 endinpos = startinpos + 2;
6127 break;
6128 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006129 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006130 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 continue;
6132 }
6133
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006134 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006135 errors,
6136 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006137 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006138 &starts,
6139 (const char **)&e,
6140 &startinpos,
6141 &endinpos,
6142 &exc,
6143 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006144 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 }
6147
Antoine Pitrou63065d72012-05-15 23:48:04 +02006148End:
Walter Dörwald69652032004-09-07 20:24:22 +00006149 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152 Py_XDECREF(errorHandler);
6153 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006154 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006157 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 Py_XDECREF(errorHandler);
6159 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 return NULL;
6161}
6162
Tim Peters772747b2001-08-09 22:21:55 +00006163PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164_PyUnicode_EncodeUTF16(PyObject *str,
6165 const char *errors,
6166 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006168 enum PyUnicode_Kind kind;
6169 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006171 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006172 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006173 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006174#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006175 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006176#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006177 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006178#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006179 const char *encoding;
6180 Py_ssize_t nsize, pos;
6181 PyObject *errorHandler = NULL;
6182 PyObject *exc = NULL;
6183 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006184
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 if (!PyUnicode_Check(str)) {
6186 PyErr_BadArgument();
6187 return NULL;
6188 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006189 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006190 return NULL;
6191 kind = PyUnicode_KIND(str);
6192 data = PyUnicode_DATA(str);
6193 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006194
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006196 if (kind == PyUnicode_4BYTE_KIND) {
6197 const Py_UCS4 *in = (const Py_UCS4 *)data;
6198 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006199 while (in < end) {
6200 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006202 }
6203 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006204 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006205 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006207 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006208 nsize = len + pairs + (byteorder == 0);
6209 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006210 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006214 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006215 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006216 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006217 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006218 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006219 }
6220 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006221 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006222 }
Tim Peters772747b2001-08-09 22:21:55 +00006223
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006224 if (kind == PyUnicode_1BYTE_KIND) {
6225 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6226 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006227 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006228
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006229 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006230 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006231 }
6232 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006233 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006234 }
6235 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006236 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006237 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006238
6239 pos = 0;
6240 while (pos < len) {
6241 Py_ssize_t repsize, moreunits;
6242
6243 if (kind == PyUnicode_2BYTE_KIND) {
6244 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6245 &out, native_ordering);
6246 }
6247 else {
6248 assert(kind == PyUnicode_4BYTE_KIND);
6249 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6250 &out, native_ordering);
6251 }
6252 if (pos == len)
6253 break;
6254
6255 rep = unicode_encode_call_errorhandler(
6256 errors, &errorHandler,
6257 encoding, "surrogates not allowed",
6258 str, &exc, pos, pos + 1, &pos);
6259 if (!rep)
6260 goto error;
6261
6262 if (PyBytes_Check(rep)) {
6263 repsize = PyBytes_GET_SIZE(rep);
6264 if (repsize & 1) {
6265 raise_encode_exception(&exc, encoding,
6266 str, pos - 1, pos,
6267 "surrogates not allowed");
6268 goto error;
6269 }
6270 moreunits = repsize / 2;
6271 }
6272 else {
6273 assert(PyUnicode_Check(rep));
6274 if (PyUnicode_READY(rep) < 0)
6275 goto error;
6276 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6277 if (!PyUnicode_IS_ASCII(rep)) {
6278 raise_encode_exception(&exc, encoding,
6279 str, pos - 1, pos,
6280 "surrogates not allowed");
6281 goto error;
6282 }
6283 }
6284
6285 /* two bytes are reserved for each surrogate */
6286 if (moreunits > 1) {
6287 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006288 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006289 /* integer overflow */
6290 PyErr_NoMemory();
6291 goto error;
6292 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006293 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006294 goto error;
6295 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6296 }
6297
6298 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006299 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006300 out += moreunits;
6301 } else /* rep is unicode */ {
6302 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6303 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6304 &out, native_ordering);
6305 }
6306
6307 Py_CLEAR(rep);
6308 }
6309
6310 /* Cut back to size actually needed. This is necessary for, for example,
6311 encoding of a string containing isolated surrogates and the 'ignore' handler
6312 is used. */
6313 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6314 if (nsize != PyBytes_GET_SIZE(v))
6315 _PyBytes_Resize(&v, nsize);
6316 Py_XDECREF(errorHandler);
6317 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006318 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006319 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006320 error:
6321 Py_XDECREF(rep);
6322 Py_XDECREF(errorHandler);
6323 Py_XDECREF(exc);
6324 Py_XDECREF(v);
6325 return NULL;
6326#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327}
6328
Alexander Belopolsky40018472011-02-26 01:02:56 +00006329PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006330PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6331 Py_ssize_t size,
6332 const char *errors,
6333 int byteorder)
6334{
6335 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006336 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006337 if (tmp == NULL)
6338 return NULL;
6339 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6340 Py_DECREF(tmp);
6341 return result;
6342}
6343
6344PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006345PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006347 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348}
6349
6350/* --- Unicode Escape Codec ----------------------------------------------- */
6351
Victor Stinner47e1afd2020-10-26 16:43:47 +01006352static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006353
Alexander Belopolsky40018472011-02-26 01:02:56 +00006354PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006355_PyUnicode_DecodeUnicodeEscape(const char *s,
6356 Py_ssize_t size,
6357 const char *errors,
6358 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006361 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363 PyObject *errorHandler = NULL;
6364 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006365
Eric V. Smith42454af2016-10-31 09:22:08 -04006366 // so we can remember if we've seen an invalid escape char or not
6367 *first_invalid_escape = NULL;
6368
Victor Stinner62ec3312016-09-06 17:04:34 -07006369 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006370 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006371 }
6372 /* Escaped strings will always be longer than the resulting
6373 Unicode string, so we start with size here and then reduce the
6374 length after conversion to the true value.
6375 (but if the error callback returns a long replacement string
6376 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006377 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006378 writer.min_length = size;
6379 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6380 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006381 }
6382
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 end = s + size;
6384 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006385 unsigned char c = (unsigned char) *s++;
6386 Py_UCS4 ch;
6387 int count;
6388 Py_ssize_t startinpos;
6389 Py_ssize_t endinpos;
6390 const char *message;
6391
6392#define WRITE_ASCII_CHAR(ch) \
6393 do { \
6394 assert(ch <= 127); \
6395 assert(writer.pos < writer.size); \
6396 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6397 } while(0)
6398
6399#define WRITE_CHAR(ch) \
6400 do { \
6401 if (ch <= writer.maxchar) { \
6402 assert(writer.pos < writer.size); \
6403 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6404 } \
6405 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6406 goto onError; \
6407 } \
6408 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
6410 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 if (c != '\\') {
6412 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 continue;
6414 }
6415
Victor Stinner62ec3312016-09-06 17:04:34 -07006416 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 if (s >= end) {
6419 message = "\\ at end of string";
6420 goto error;
6421 }
6422 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006423
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006425 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006428 case '\n': continue;
6429 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6430 case '\'': WRITE_ASCII_CHAR('\''); continue;
6431 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6432 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006433 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6435 case 't': WRITE_ASCII_CHAR('\t'); continue;
6436 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6437 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006438 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006439 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006440 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006441 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 case '0': case '1': case '2': case '3':
6445 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006446 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006447 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 ch = (ch<<3) + *s++ - '0';
6449 if (s < end && '0' <= *s && *s <= '7') {
6450 ch = (ch<<3) + *s++ - '0';
6451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006453 WRITE_CHAR(ch);
6454 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 /* hex escapes */
6457 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006459 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006460 message = "truncated \\xXX escape";
6461 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006466 message = "truncated \\uXXXX escape";
6467 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006470 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006472 message = "truncated \\UXXXXXXXX escape";
6473 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006474 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006475 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 ch <<= 4;
6477 if (c >= '0' && c <= '9') {
6478 ch += c - '0';
6479 }
6480 else if (c >= 'a' && c <= 'f') {
6481 ch += c - ('a' - 10);
6482 }
6483 else if (c >= 'A' && c <= 'F') {
6484 ch += c - ('A' - 10);
6485 }
6486 else {
6487 break;
6488 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006489 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006491 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006492 }
6493
6494 /* when we get here, ch is a 32-bit unicode character */
6495 if (ch > MAX_UNICODE) {
6496 message = "illegal Unicode character";
6497 goto error;
6498 }
6499
6500 WRITE_CHAR(ch);
6501 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006502
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006504 case 'N':
Victor Stinner47e1afd2020-10-26 16:43:47 +01006505 if (ucnhash_capi == NULL) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006506 /* load the unicode data module */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006507 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006508 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner47e1afd2020-10-26 16:43:47 +01006509 if (ucnhash_capi == NULL) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 PyErr_SetString(
6511 PyExc_UnicodeError,
6512 "\\N escapes not supported (can't load unicodedata module)"
6513 );
6514 goto onError;
6515 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006516 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006517
6518 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006519 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006520 const char *start = ++s;
6521 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006522 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006523 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006524 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006525 namelen = s - start;
6526 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006527 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006528 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006529 ch = 0xffffffff; /* in case 'getcode' messes up */
6530 if (namelen <= INT_MAX &&
Victor Stinner920cb642020-10-26 19:19:36 +01006531 ucnhash_capi->getcode(start, (int)namelen,
Victor Stinner62ec3312016-09-06 17:04:34 -07006532 &ch, 0)) {
6533 assert(ch <= MAX_UNICODE);
6534 WRITE_CHAR(ch);
6535 continue;
6536 }
6537 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006538 }
6539 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006540 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006541
6542 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006543 if (*first_invalid_escape == NULL) {
6544 *first_invalid_escape = s-1; /* Back up one char, since we've
6545 already incremented s. */
6546 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006547 WRITE_ASCII_CHAR('\\');
6548 WRITE_CHAR(c);
6549 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006551
6552 error:
6553 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006554 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006555 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006556 errors, &errorHandler,
6557 "unicodeescape", message,
6558 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006559 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006560 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006561 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006562 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006563
6564#undef WRITE_ASCII_CHAR
6565#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006567
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006568 Py_XDECREF(errorHandler);
6569 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006570 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006571
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006573 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574 Py_XDECREF(errorHandler);
6575 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 return NULL;
6577}
6578
Eric V. Smith42454af2016-10-31 09:22:08 -04006579PyObject *
6580PyUnicode_DecodeUnicodeEscape(const char *s,
6581 Py_ssize_t size,
6582 const char *errors)
6583{
6584 const char *first_invalid_escape;
6585 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6586 &first_invalid_escape);
6587 if (result == NULL)
6588 return NULL;
6589 if (first_invalid_escape != NULL) {
6590 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6591 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006592 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006593 Py_DECREF(result);
6594 return NULL;
6595 }
6596 }
6597 return result;
6598}
6599
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006600/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601
Alexander Belopolsky40018472011-02-26 01:02:56 +00006602PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006603PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006605 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006606 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006608 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006609 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006610 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
Ezio Melottie7f90372012-10-05 03:33:31 +03006612 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006613 escape.
6614
Ezio Melottie7f90372012-10-05 03:33:31 +03006615 For UCS1 strings it's '\xxx', 4 bytes per source character.
6616 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6617 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006618 */
6619
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006620 if (!PyUnicode_Check(unicode)) {
6621 PyErr_BadArgument();
6622 return NULL;
6623 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006624 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006625 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006626 }
Victor Stinner358af132015-10-12 22:36:57 +02006627
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006628 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006629 if (len == 0) {
6630 return PyBytes_FromStringAndSize(NULL, 0);
6631 }
6632
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006633 kind = PyUnicode_KIND(unicode);
6634 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006635 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6636 bytes, and 1 byte characters 4. */
6637 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006638 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006639 return PyErr_NoMemory();
6640 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006641 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006642 if (repr == NULL) {
6643 return NULL;
6644 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006645
Victor Stinner62ec3312016-09-06 17:04:34 -07006646 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006647 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006648 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006649
Victor Stinner62ec3312016-09-06 17:04:34 -07006650 /* U+0000-U+00ff range */
6651 if (ch < 0x100) {
6652 if (ch >= ' ' && ch < 127) {
6653 if (ch != '\\') {
6654 /* Copy printable US ASCII as-is */
6655 *p++ = (char) ch;
6656 }
6657 /* Escape backslashes */
6658 else {
6659 *p++ = '\\';
6660 *p++ = '\\';
6661 }
6662 }
Victor Stinner358af132015-10-12 22:36:57 +02006663
Victor Stinner62ec3312016-09-06 17:04:34 -07006664 /* Map special whitespace to '\t', \n', '\r' */
6665 else if (ch == '\t') {
6666 *p++ = '\\';
6667 *p++ = 't';
6668 }
6669 else if (ch == '\n') {
6670 *p++ = '\\';
6671 *p++ = 'n';
6672 }
6673 else if (ch == '\r') {
6674 *p++ = '\\';
6675 *p++ = 'r';
6676 }
6677
6678 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6679 else {
6680 *p++ = '\\';
6681 *p++ = 'x';
6682 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6683 *p++ = Py_hexdigits[ch & 0x000F];
6684 }
Tim Petersced69f82003-09-16 20:30:58 +00006685 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006686 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006687 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 *p++ = '\\';
6689 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006690 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6691 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6692 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6693 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006695 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6696 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006697
Victor Stinner62ec3312016-09-06 17:04:34 -07006698 /* Make sure that the first two digits are zero */
6699 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006700 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006701 *p++ = 'U';
6702 *p++ = '0';
6703 *p++ = '0';
6704 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6705 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6706 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6707 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6708 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6709 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712
Victor Stinner62ec3312016-09-06 17:04:34 -07006713 assert(p - PyBytes_AS_STRING(repr) > 0);
6714 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6715 return NULL;
6716 }
6717 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718}
6719
Alexander Belopolsky40018472011-02-26 01:02:56 +00006720PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006721PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6722 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006724 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006725 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006726 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006728 }
6729
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006730 result = PyUnicode_AsUnicodeEscapeString(tmp);
6731 Py_DECREF(tmp);
6732 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733}
6734
6735/* --- Raw Unicode Escape Codec ------------------------------------------- */
6736
Alexander Belopolsky40018472011-02-26 01:02:56 +00006737PyObject *
6738PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006739 Py_ssize_t size,
6740 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006743 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006745 PyObject *errorHandler = NULL;
6746 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006747
Victor Stinner62ec3312016-09-06 17:04:34 -07006748 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006749 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006750 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006751
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 /* Escaped strings will always be longer than the resulting
6753 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754 length after conversion to the true value. (But decoding error
6755 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006756 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006757 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006758 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6759 goto onError;
6760 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006761
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 end = s + size;
6763 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006764 unsigned char c = (unsigned char) *s++;
6765 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006766 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006767 Py_ssize_t startinpos;
6768 Py_ssize_t endinpos;
6769 const char *message;
6770
6771#define WRITE_CHAR(ch) \
6772 do { \
6773 if (ch <= writer.maxchar) { \
6774 assert(writer.pos < writer.size); \
6775 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6776 } \
6777 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6778 goto onError; \
6779 } \
6780 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006783 if (c != '\\' || s >= end) {
6784 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006786 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006787
Victor Stinner62ec3312016-09-06 17:04:34 -07006788 c = (unsigned char) *s++;
6789 if (c == 'u') {
6790 count = 4;
6791 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006793 else if (c == 'U') {
6794 count = 8;
6795 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006796 }
6797 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006798 assert(writer.pos < writer.size);
6799 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6800 WRITE_CHAR(c);
6801 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006802 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006803 startinpos = s - starts - 2;
6804
6805 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6806 for (ch = 0; count && s < end; ++s, --count) {
6807 c = (unsigned char)*s;
6808 ch <<= 4;
6809 if (c >= '0' && c <= '9') {
6810 ch += c - '0';
6811 }
6812 else if (c >= 'a' && c <= 'f') {
6813 ch += c - ('a' - 10);
6814 }
6815 else if (c >= 'A' && c <= 'F') {
6816 ch += c - ('A' - 10);
6817 }
6818 else {
6819 break;
6820 }
6821 }
6822 if (!count) {
6823 if (ch <= MAX_UNICODE) {
6824 WRITE_CHAR(ch);
6825 continue;
6826 }
6827 message = "\\Uxxxxxxxx out of range";
6828 }
6829
6830 endinpos = s-starts;
6831 writer.min_length = end - s + writer.pos;
6832 if (unicode_decode_call_errorhandler_writer(
6833 errors, &errorHandler,
6834 "rawunicodeescape", message,
6835 &starts, &end, &startinpos, &endinpos, &exc, &s,
6836 &writer)) {
6837 goto onError;
6838 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006839 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006840
6841#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006843 Py_XDECREF(errorHandler);
6844 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006845 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006846
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006848 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849 Py_XDECREF(errorHandler);
6850 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006852
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853}
6854
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006855
Alexander Belopolsky40018472011-02-26 01:02:56 +00006856PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006857PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858{
Victor Stinner62ec3312016-09-06 17:04:34 -07006859 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006861 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006862 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006863 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006864 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006866 if (!PyUnicode_Check(unicode)) {
6867 PyErr_BadArgument();
6868 return NULL;
6869 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006870 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006871 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006872 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006873 kind = PyUnicode_KIND(unicode);
6874 data = PyUnicode_DATA(unicode);
6875 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006876 if (kind == PyUnicode_1BYTE_KIND) {
6877 return PyBytes_FromStringAndSize(data, len);
6878 }
Victor Stinner0e368262011-11-10 20:12:49 +01006879
Victor Stinner62ec3312016-09-06 17:04:34 -07006880 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6881 bytes, and 1 byte characters 4. */
6882 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006883
Victor Stinner62ec3312016-09-06 17:04:34 -07006884 if (len > PY_SSIZE_T_MAX / expandsize) {
6885 return PyErr_NoMemory();
6886 }
6887 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6888 if (repr == NULL) {
6889 return NULL;
6890 }
6891 if (len == 0) {
6892 return repr;
6893 }
6894
6895 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006896 for (pos = 0; pos < len; pos++) {
6897 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006898
Victor Stinner62ec3312016-09-06 17:04:34 -07006899 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6900 if (ch < 0x100) {
6901 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006902 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006903 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006904 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 *p++ = '\\';
6906 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006907 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6908 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6909 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6910 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006912 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6913 else {
6914 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6915 *p++ = '\\';
6916 *p++ = 'U';
6917 *p++ = '0';
6918 *p++ = '0';
6919 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6920 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6921 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6922 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6923 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6924 *p++ = Py_hexdigits[ch & 15];
6925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006927
Victor Stinner62ec3312016-09-06 17:04:34 -07006928 assert(p > PyBytes_AS_STRING(repr));
6929 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6930 return NULL;
6931 }
6932 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933}
6934
Alexander Belopolsky40018472011-02-26 01:02:56 +00006935PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006936PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6937 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006939 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006940 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006941 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006942 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006943 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6944 Py_DECREF(tmp);
6945 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946}
6947
6948/* --- Latin-1 Codec ------------------------------------------------------ */
6949
Alexander Belopolsky40018472011-02-26 01:02:56 +00006950PyObject *
6951PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006952 Py_ssize_t size,
6953 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006956 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957}
6958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006960static void
6961make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006962 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006963 PyObject *unicode,
6964 Py_ssize_t startpos, Py_ssize_t endpos,
6965 const char *reason)
6966{
6967 if (*exceptionObject == NULL) {
6968 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006969 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006970 encoding, unicode, startpos, endpos, reason);
6971 }
6972 else {
6973 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6974 goto onError;
6975 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6976 goto onError;
6977 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6978 goto onError;
6979 return;
6980 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006981 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006982 }
6983}
6984
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006985/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006986static void
6987raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006988 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006989 PyObject *unicode,
6990 Py_ssize_t startpos, Py_ssize_t endpos,
6991 const char *reason)
6992{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006993 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006994 encoding, unicode, startpos, endpos, reason);
6995 if (*exceptionObject != NULL)
6996 PyCodec_StrictErrors(*exceptionObject);
6997}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998
6999/* error handling callback helper:
7000 build arguments, call the callback and check the arguments,
7001 put the result into newpos and return the replacement string, which
7002 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007003static PyObject *
7004unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007005 PyObject **errorHandler,
7006 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007007 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007008 Py_ssize_t startpos, Py_ssize_t endpos,
7009 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007010{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007011 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007012 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 PyObject *restuple;
7014 PyObject *resunicode;
7015
7016 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007018 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007020 }
7021
Benjamin Petersonbac79492012-01-14 13:34:47 -05007022 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007023 return NULL;
7024 len = PyUnicode_GET_LENGTH(unicode);
7025
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007026 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007027 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007028 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007030
Petr Viktorinffd97532020-02-11 17:46:57 +01007031 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007032 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007034 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007035 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 Py_DECREF(restuple);
7037 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007038 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007039 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 &resunicode, newpos)) {
7041 Py_DECREF(restuple);
7042 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007043 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007044 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7045 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7046 Py_DECREF(restuple);
7047 return NULL;
7048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007049 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007050 *newpos = len + *newpos;
7051 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007052 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 Py_DECREF(restuple);
7054 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007055 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056 Py_INCREF(resunicode);
7057 Py_DECREF(restuple);
7058 return resunicode;
7059}
7060
Alexander Belopolsky40018472011-02-26 01:02:56 +00007061static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007062unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007063 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007064 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007065{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007066 /* input state */
7067 Py_ssize_t pos=0, size;
7068 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007069 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007070 /* pointer into the output */
7071 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007072 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7073 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007074 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007075 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007076 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007077 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007078 /* output object */
7079 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007080
Benjamin Petersonbac79492012-01-14 13:34:47 -05007081 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007082 return NULL;
7083 size = PyUnicode_GET_LENGTH(unicode);
7084 kind = PyUnicode_KIND(unicode);
7085 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007086 /* allocate enough for a simple encoding without
7087 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007088 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007089 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007090
7091 _PyBytesWriter_Init(&writer);
7092 str = _PyBytesWriter_Alloc(&writer, size);
7093 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007094 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007095
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007096 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007097 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007098
Benjamin Peterson29060642009-01-31 22:14:21 +00007099 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007100 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007102 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007103 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007104 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007106 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007108 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007109 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007110 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007111
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007112 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007114
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007115 /* Only overallocate the buffer if it's not the last write */
7116 writer.overallocate = (collend < size);
7117
Benjamin Peterson29060642009-01-31 22:14:21 +00007118 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007119 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007120 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007121
7122 switch (error_handler) {
7123 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007124 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007126
7127 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007128 memset(str, '?', collend - collstart);
7129 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007130 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007131 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007132 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 break;
Victor Stinner50149202015-09-22 00:26:54 +02007134
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007135 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007136 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007137 writer.min_size -= (collend - collstart);
7138 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007139 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007140 if (str == NULL)
7141 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007142 pos = collend;
7143 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007144
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007145 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007146 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007147 writer.min_size -= (collend - collstart);
7148 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007149 unicode, collstart, collend);
7150 if (str == NULL)
7151 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007152 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007153 break;
Victor Stinner50149202015-09-22 00:26:54 +02007154
Victor Stinnerc3713e92015-09-29 12:32:13 +02007155 case _Py_ERROR_SURROGATEESCAPE:
7156 for (i = collstart; i < collend; ++i) {
7157 ch = PyUnicode_READ(kind, data, i);
7158 if (ch < 0xdc80 || 0xdcff < ch) {
7159 /* Not a UTF-8b surrogate */
7160 break;
7161 }
7162 *str++ = (char)(ch - 0xdc00);
7163 ++pos;
7164 }
7165 if (i >= collend)
7166 break;
7167 collstart = pos;
7168 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007169 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007170
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007172 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7173 encoding, reason, unicode, &exc,
7174 collstart, collend, &newpos);
7175 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007177
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007178 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007179 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007180
Victor Stinner6bd525b2015-10-09 13:10:05 +02007181 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007182 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007183 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007184 PyBytes_AS_STRING(rep),
7185 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007186 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007187 else {
7188 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007189
Victor Stinner6bd525b2015-10-09 13:10:05 +02007190 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007192
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007193 if (limit == 256 ?
7194 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7195 !PyUnicode_IS_ASCII(rep))
7196 {
7197 /* Not all characters are smaller than limit */
7198 raise_encode_exception(&exc, encoding, unicode,
7199 collstart, collend, reason);
7200 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007202 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7203 str = _PyBytesWriter_WriteBytes(&writer, str,
7204 PyUnicode_DATA(rep),
7205 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007207 if (str == NULL)
7208 goto onError;
7209
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007210 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007211 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007212 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007213
7214 /* If overallocation was disabled, ensure that it was the last
7215 write. Otherwise, we missed an optimization */
7216 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007217 }
7218 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007219
Victor Stinner50149202015-09-22 00:26:54 +02007220 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007221 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007222 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007223
7224 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007225 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007226 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007227 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007228 Py_XDECREF(exc);
7229 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007230}
7231
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007232/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007233PyObject *
7234PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007235 Py_ssize_t size,
7236 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007238 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007239 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007240 if (unicode == NULL)
7241 return NULL;
7242 result = unicode_encode_ucs1(unicode, errors, 256);
7243 Py_DECREF(unicode);
7244 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245}
7246
Alexander Belopolsky40018472011-02-26 01:02:56 +00007247PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007248_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249{
7250 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 PyErr_BadArgument();
7252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007254 if (PyUnicode_READY(unicode) == -1)
7255 return NULL;
7256 /* Fast path: if it is a one-byte string, construct
7257 bytes object directly. */
7258 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7259 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7260 PyUnicode_GET_LENGTH(unicode));
7261 /* Non-Latin-1 characters present. Defer to above function to
7262 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007263 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007264}
7265
7266PyObject*
7267PyUnicode_AsLatin1String(PyObject *unicode)
7268{
7269 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270}
7271
7272/* --- 7-bit ASCII Codec -------------------------------------------------- */
7273
Alexander Belopolsky40018472011-02-26 01:02:56 +00007274PyObject *
7275PyUnicode_DecodeASCII(const char *s,
7276 Py_ssize_t size,
7277 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007279 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007280 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007281 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007282 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007283 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007284
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007286 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007287
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007289 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007290 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007292
Inada Naoki770847a2019-06-24 12:30:24 +09007293 // Shortcut for simple case
7294 PyObject *u = PyUnicode_New(size, 127);
7295 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007296 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007297 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007298 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007299 if (outpos == size) {
7300 return u;
7301 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007302
Inada Naoki770847a2019-06-24 12:30:24 +09007303 _PyUnicodeWriter writer;
7304 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007305 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007306
Inada Naoki770847a2019-06-24 12:30:24 +09007307 s += outpos;
7308 int kind = writer.kind;
7309 void *data = writer.data;
7310 Py_ssize_t startinpos, endinpos;
7311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007312 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007313 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007315 PyUnicode_WRITE(kind, data, writer.pos, c);
7316 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007318 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007320
7321 /* byte outsize range 0x00..0x7f: call the error handler */
7322
7323 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007324 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007325
7326 switch (error_handler)
7327 {
7328 case _Py_ERROR_REPLACE:
7329 case _Py_ERROR_SURROGATEESCAPE:
7330 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007331 but we may switch to UCS2 at the first write */
7332 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7333 goto onError;
7334 kind = writer.kind;
7335 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007336
7337 if (error_handler == _Py_ERROR_REPLACE)
7338 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7339 else
7340 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7341 writer.pos++;
7342 ++s;
7343 break;
7344
7345 case _Py_ERROR_IGNORE:
7346 ++s;
7347 break;
7348
7349 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 startinpos = s-starts;
7351 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007352 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007353 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007354 "ascii", "ordinal not in range(128)",
7355 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007356 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007358 kind = writer.kind;
7359 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007362 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007363 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007364 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007365
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007367 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007368 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007369 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370 return NULL;
7371}
7372
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007373/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007374PyObject *
7375PyUnicode_EncodeASCII(const Py_UNICODE *p,
7376 Py_ssize_t size,
7377 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007379 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007380 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007381 if (unicode == NULL)
7382 return NULL;
7383 result = unicode_encode_ucs1(unicode, errors, 128);
7384 Py_DECREF(unicode);
7385 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386}
7387
Alexander Belopolsky40018472011-02-26 01:02:56 +00007388PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007389_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390{
7391 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 PyErr_BadArgument();
7393 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007395 if (PyUnicode_READY(unicode) == -1)
7396 return NULL;
7397 /* Fast path: if it is an ASCII-only string, construct bytes object
7398 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007399 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007400 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7401 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007402 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007403}
7404
7405PyObject *
7406PyUnicode_AsASCIIString(PyObject *unicode)
7407{
7408 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409}
7410
Steve Dowercc16be82016-09-08 10:35:16 -07007411#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007412
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007413/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007414
/* NEED_RETRY is defined when `int` is narrower than `size_t`.
   NOTE(review): presumably this lets the code-page codecs below process
   input in several int-sized chunks -- confirm against its users. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Largest chunk handed to the decoder in one call.  INT_MAX is the
   theoretical maximum (INT_MAX / 2 when transcoding from UTF-16), but
   INT_MAX / 4 performs better in both cases and also keeps partial
   characters from overrunning the length limit of MultiByteToWideChar
   on Windows. */
#define DECODING_CHUNK_SIZE (INT_MAX / 4)

/* Fallback definition for headers that do not provide this flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7428
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007429static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007430code_page_name(UINT code_page, PyObject **obj)
7431{
7432 *obj = NULL;
7433 if (code_page == CP_ACP)
7434 return "mbcs";
7435 if (code_page == CP_UTF7)
7436 return "CP_UTF7";
7437 if (code_page == CP_UTF8)
7438 return "CP_UTF8";
7439
7440 *obj = PyBytes_FromFormat("cp%u", code_page);
7441 if (*obj == NULL)
7442 return NULL;
7443 return PyBytes_AS_STRING(*obj);
7444}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007445
Victor Stinner3a50e702011-10-18 21:21:00 +02007446static DWORD
7447decode_code_page_flags(UINT code_page)
7448{
7449 if (code_page == CP_UTF7) {
7450 /* The CP_UTF7 decoder only supports flags=0 */
7451 return 0;
7452 }
7453 else
7454 return MB_ERR_INVALID_CHARS;
7455}
7456
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007457/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 * Decode a byte string from a Windows code page into unicode object in strict
7459 * mode.
7460 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007461 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7462 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007463 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007464static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007465decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007466 wchar_t **buf,
7467 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 const char *in,
7469 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007470{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007471 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007472 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007474
7475 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007477 while ((outsize = MultiByteToWideChar(code_page, flags,
7478 in, insize, NULL, 0)) <= 0)
7479 {
7480 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7481 goto error;
7482 }
7483 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7484 flags = 0;
7485 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007487 /* Extend a wchar_t* buffer */
7488 Py_ssize_t n = *bufsize; /* Get the current length */
7489 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7490 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007491 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007492 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007493
7494 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7496 if (outsize <= 0)
7497 goto error;
7498 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007499
Victor Stinner3a50e702011-10-18 21:21:00 +02007500error:
7501 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7502 return -2;
7503 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007504 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007505}
7506
Victor Stinner3a50e702011-10-18 21:21:00 +02007507/*
7508 * Decode a byte string from a code page into unicode object with an error
7509 * handler.
7510 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007511 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 * UnicodeDecodeError exception and returns -1 on error.
7513 */
7514static int
7515decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007516 wchar_t **buf,
7517 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007518 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007519 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007520{
7521 const char *startin = in;
7522 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007523 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 /* Ideally, we should get reason from FormatMessage. This is the Windows
7525 2000 English version of the message. */
7526 const char *reason = "No mapping for the Unicode character exists "
7527 "in the target code page.";
7528 /* each step cannot decode more than 1 character, but a character can be
7529 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007530 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007531 int insize;
7532 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 PyObject *errorHandler = NULL;
7534 PyObject *exc = NULL;
7535 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007536 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 DWORD err;
7538 int ret = -1;
7539
7540 assert(size > 0);
7541
7542 encoding = code_page_name(code_page, &encoding_obj);
7543 if (encoding == NULL)
7544 return -1;
7545
Victor Stinner7d00cc12014-03-17 23:08:06 +01007546 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7548 UnicodeDecodeError. */
7549 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7550 if (exc != NULL) {
7551 PyCodec_StrictErrors(exc);
7552 Py_CLEAR(exc);
7553 }
7554 goto error;
7555 }
7556
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007557 /* Extend a wchar_t* buffer */
7558 Py_ssize_t n = *bufsize; /* Get the current length */
7559 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7560 PyErr_NoMemory();
7561 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007562 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007563 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7564 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007566 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007567
7568 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007569 while (in < endin)
7570 {
7571 /* Decode a character */
7572 insize = 1;
7573 do
7574 {
7575 outsize = MultiByteToWideChar(code_page, flags,
7576 in, insize,
7577 buffer, Py_ARRAY_LENGTH(buffer));
7578 if (outsize > 0)
7579 break;
7580 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007581 if (err == ERROR_INVALID_FLAGS && flags) {
7582 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7583 flags = 0;
7584 continue;
7585 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 if (err != ERROR_NO_UNICODE_TRANSLATION
7587 && err != ERROR_INSUFFICIENT_BUFFER)
7588 {
7589 PyErr_SetFromWindowsErr(0);
7590 goto error;
7591 }
7592 insize++;
7593 }
7594 /* 4=maximum length of a UTF-8 sequence */
7595 while (insize <= 4 && (in + insize) <= endin);
7596
7597 if (outsize <= 0) {
7598 Py_ssize_t startinpos, endinpos, outpos;
7599
Victor Stinner7d00cc12014-03-17 23:08:06 +01007600 /* last character in partial decode? */
7601 if (in + insize >= endin && !final)
7602 break;
7603
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 startinpos = in - startin;
7605 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007606 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007607 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007608 errors, &errorHandler,
7609 encoding, reason,
7610 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007611 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 {
7613 goto error;
7614 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007615 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 }
7617 else {
7618 in += insize;
7619 memcpy(out, buffer, outsize * sizeof(wchar_t));
7620 out += outsize;
7621 }
7622 }
7623
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007624 /* Shrink the buffer */
7625 assert(out - *buf <= *bufsize);
7626 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007627 /* (in - startin) <= size and size is an int */
7628 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007629
7630error:
7631 Py_XDECREF(encoding_obj);
7632 Py_XDECREF(errorHandler);
7633 Py_XDECREF(exc);
7634 return ret;
7635}
7636
Victor Stinner3a50e702011-10-18 21:21:00 +02007637static PyObject *
7638decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007639 const char *s, Py_ssize_t size,
7640 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007641{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007642 wchar_t *buf = NULL;
7643 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007644 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007645
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 if (code_page < 0) {
7647 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7648 return NULL;
7649 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007650 if (size < 0) {
7651 PyErr_BadInternalCall();
7652 return NULL;
7653 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007654
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007655 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007657
Victor Stinner76a31a62011-11-04 00:05:13 +01007658 do
7659 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007660#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007661 if (size > DECODING_CHUNK_SIZE) {
7662 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007663 final = 0;
7664 done = 0;
7665 }
7666 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007667#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007668 {
7669 chunk_size = (int)size;
7670 final = (consumed == NULL);
7671 done = 1;
7672 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007673
Victor Stinner76a31a62011-11-04 00:05:13 +01007674 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007675 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007676 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007677 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007678 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007679
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007680 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007681 s, chunk_size);
7682 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007683 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007684 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007685 errors, final);
7686 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007687
7688 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007689 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007690 return NULL;
7691 }
7692
7693 if (consumed)
7694 *consumed += converted;
7695
7696 s += converted;
7697 size -= converted;
7698 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007699
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007700 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7701 PyMem_Free(buf);
7702 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007703}
7704
Alexander Belopolsky40018472011-02-26 01:02:56 +00007705PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007706PyUnicode_DecodeCodePageStateful(int code_page,
7707 const char *s,
7708 Py_ssize_t size,
7709 const char *errors,
7710 Py_ssize_t *consumed)
7711{
7712 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7713}
7714
7715PyObject *
7716PyUnicode_DecodeMBCSStateful(const char *s,
7717 Py_ssize_t size,
7718 const char *errors,
7719 Py_ssize_t *consumed)
7720{
7721 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7722}
7723
7724PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007725PyUnicode_DecodeMBCS(const char *s,
7726 Py_ssize_t size,
7727 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007728{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7730}
7731
Victor Stinner3a50e702011-10-18 21:21:00 +02007732static DWORD
7733encode_code_page_flags(UINT code_page, const char *errors)
7734{
7735 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007736 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007737 }
7738 else if (code_page == CP_UTF7) {
7739 /* CP_UTF7 only supports flags=0 */
7740 return 0;
7741 }
7742 else {
7743 if (errors != NULL && strcmp(errors, "replace") == 0)
7744 return 0;
7745 else
7746 return WC_NO_BEST_FIT_CHARS;
7747 }
7748}
7749
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007750/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007751 * Encode a Unicode string to a Windows code page into a byte string in strict
7752 * mode.
7753 *
7754 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007755 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007756 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007757static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007758encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007759 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007760 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007761{
Victor Stinner554f3f02010-06-16 23:33:54 +00007762 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007763 BOOL *pusedDefaultChar = &usedDefaultChar;
7764 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007765 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007766 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007767 const DWORD flags = encode_code_page_flags(code_page, NULL);
7768 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007769 /* Create a substring so that we can get the UTF-16 representation
7770 of just the slice under consideration. */
7771 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007772 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007773
Martin v. Löwis3d325192011-11-04 18:23:06 +01007774 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007775
Victor Stinner3a50e702011-10-18 21:21:00 +02007776 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007777 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007778 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007779 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007780
Victor Stinner2fc507f2011-11-04 20:06:39 +01007781 substring = PyUnicode_Substring(unicode, offset, offset+len);
7782 if (substring == NULL)
7783 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007784#if USE_UNICODE_WCHAR_CACHE
7785_Py_COMP_DIAG_PUSH
7786_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007787 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7788 if (p == NULL) {
7789 Py_DECREF(substring);
7790 return -1;
7791 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007792_Py_COMP_DIAG_POP
7793#else /* USE_UNICODE_WCHAR_CACHE */
7794 p = PyUnicode_AsWideCharString(substring, &size);
7795 Py_CLEAR(substring);
7796 if (p == NULL) {
7797 return -1;
7798 }
7799#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007800 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007801
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007802 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007803 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007804 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007805 NULL, 0,
7806 NULL, pusedDefaultChar);
7807 if (outsize <= 0)
7808 goto error;
7809 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007810 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007811 ret = -2;
7812 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007813 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007814
Victor Stinner3a50e702011-10-18 21:21:00 +02007815 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007817 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007818 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007819 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007820 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007821 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007822 }
7823 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007825 const Py_ssize_t n = PyBytes_Size(*outbytes);
7826 if (outsize > PY_SSIZE_T_MAX - n) {
7827 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007828 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007829 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007830 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007831 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007832 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007833 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007834 }
7835
7836 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007837 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007838 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007839 out, outsize,
7840 NULL, pusedDefaultChar);
7841 if (outsize <= 0)
7842 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007843 if (pusedDefaultChar && *pusedDefaultChar) {
7844 ret = -2;
7845 goto done;
7846 }
7847 ret = 0;
7848
7849done:
7850#if USE_UNICODE_WCHAR_CACHE
7851 Py_DECREF(substring);
7852#else /* USE_UNICODE_WCHAR_CACHE */
7853 PyMem_Free(p);
7854#endif /* USE_UNICODE_WCHAR_CACHE */
7855 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007856
Victor Stinner3a50e702011-10-18 21:21:00 +02007857error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007858 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7859 ret = -2;
7860 goto done;
7861 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007862 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007863 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007864}
7865
Victor Stinner3a50e702011-10-18 21:21:00 +02007866/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007867 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007868 * error handler.
7869 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007870 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007871 * -1 on other error.
7872 */
7873static int
7874encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007875 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007876 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007877{
Victor Stinner3a50e702011-10-18 21:21:00 +02007878 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007879 Py_ssize_t pos = unicode_offset;
7880 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007881 /* Ideally, we should get reason from FormatMessage. This is the Windows
7882 2000 English version of the message. */
7883 const char *reason = "invalid character";
7884 /* 4=maximum length of a UTF-8 sequence */
7885 char buffer[4];
7886 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7887 Py_ssize_t outsize;
7888 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007889 PyObject *errorHandler = NULL;
7890 PyObject *exc = NULL;
7891 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007892 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007893 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007894 PyObject *rep;
7895 int ret = -1;
7896
7897 assert(insize > 0);
7898
7899 encoding = code_page_name(code_page, &encoding_obj);
7900 if (encoding == NULL)
7901 return -1;
7902
7903 if (errors == NULL || strcmp(errors, "strict") == 0) {
7904 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7905 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007906 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007907 if (exc != NULL) {
7908 PyCodec_StrictErrors(exc);
7909 Py_DECREF(exc);
7910 }
7911 Py_XDECREF(encoding_obj);
7912 return -1;
7913 }
7914
7915 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7916 pusedDefaultChar = &usedDefaultChar;
7917 else
7918 pusedDefaultChar = NULL;
7919
7920 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7921 PyErr_NoMemory();
7922 goto error;
7923 }
7924 outsize = insize * Py_ARRAY_LENGTH(buffer);
7925
7926 if (*outbytes == NULL) {
7927 /* Create string object */
7928 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7929 if (*outbytes == NULL)
7930 goto error;
7931 out = PyBytes_AS_STRING(*outbytes);
7932 }
7933 else {
7934 /* Extend string object */
7935 Py_ssize_t n = PyBytes_Size(*outbytes);
7936 if (n > PY_SSIZE_T_MAX - outsize) {
7937 PyErr_NoMemory();
7938 goto error;
7939 }
7940 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7941 goto error;
7942 out = PyBytes_AS_STRING(*outbytes) + n;
7943 }
7944
7945 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007946 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007947 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007948 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7949 wchar_t chars[2];
7950 int charsize;
7951 if (ch < 0x10000) {
7952 chars[0] = (wchar_t)ch;
7953 charsize = 1;
7954 }
7955 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007956 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7957 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007958 charsize = 2;
7959 }
7960
Victor Stinner3a50e702011-10-18 21:21:00 +02007961 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007962 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007963 buffer, Py_ARRAY_LENGTH(buffer),
7964 NULL, pusedDefaultChar);
7965 if (outsize > 0) {
7966 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7967 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007968 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007969 memcpy(out, buffer, outsize);
7970 out += outsize;
7971 continue;
7972 }
7973 }
7974 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7975 PyErr_SetFromWindowsErr(0);
7976 goto error;
7977 }
7978
Victor Stinner3a50e702011-10-18 21:21:00 +02007979 rep = unicode_encode_call_errorhandler(
7980 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007981 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007982 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007983 if (rep == NULL)
7984 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007985 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007986
7987 if (PyBytes_Check(rep)) {
7988 outsize = PyBytes_GET_SIZE(rep);
7989 if (outsize != 1) {
7990 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7991 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7992 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7993 Py_DECREF(rep);
7994 goto error;
7995 }
7996 out = PyBytes_AS_STRING(*outbytes) + offset;
7997 }
7998 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7999 out += outsize;
8000 }
8001 else {
8002 Py_ssize_t i;
8003 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008004 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008005
Benjamin Petersonbac79492012-01-14 13:34:47 -05008006 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008007 Py_DECREF(rep);
8008 goto error;
8009 }
8010
8011 outsize = PyUnicode_GET_LENGTH(rep);
8012 if (outsize != 1) {
8013 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8014 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8015 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8016 Py_DECREF(rep);
8017 goto error;
8018 }
8019 out = PyBytes_AS_STRING(*outbytes) + offset;
8020 }
8021 kind = PyUnicode_KIND(rep);
8022 data = PyUnicode_DATA(rep);
8023 for (i=0; i < outsize; i++) {
8024 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8025 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008026 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008027 encoding, unicode,
8028 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008029 "unable to encode error handler result to ASCII");
8030 Py_DECREF(rep);
8031 goto error;
8032 }
8033 *out = (unsigned char)ch;
8034 out++;
8035 }
8036 }
8037 Py_DECREF(rep);
8038 }
8039 /* write a NUL byte */
8040 *out = 0;
8041 outsize = out - PyBytes_AS_STRING(*outbytes);
8042 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8043 if (_PyBytes_Resize(outbytes, outsize) < 0)
8044 goto error;
8045 ret = 0;
8046
8047error:
8048 Py_XDECREF(encoding_obj);
8049 Py_XDECREF(errorHandler);
8050 Py_XDECREF(exc);
8051 return ret;
8052}
8053
Victor Stinner3a50e702011-10-18 21:21:00 +02008054static PyObject *
8055encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008056 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008057 const char *errors)
8058{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008059 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008060 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008061 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008062 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008063
Victor Stinner29dacf22015-01-26 16:41:32 +01008064 if (!PyUnicode_Check(unicode)) {
8065 PyErr_BadArgument();
8066 return NULL;
8067 }
8068
Benjamin Petersonbac79492012-01-14 13:34:47 -05008069 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008070 return NULL;
8071 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008072
Victor Stinner3a50e702011-10-18 21:21:00 +02008073 if (code_page < 0) {
8074 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8075 return NULL;
8076 }
8077
Martin v. Löwis3d325192011-11-04 18:23:06 +01008078 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008079 return PyBytes_FromStringAndSize(NULL, 0);
8080
Victor Stinner7581cef2011-11-03 22:32:33 +01008081 offset = 0;
8082 do
8083 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008084#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008085 if (len > DECODING_CHUNK_SIZE) {
8086 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008087 done = 0;
8088 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008089 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008090#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008091 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008092 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008093 done = 1;
8094 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008095
Victor Stinner76a31a62011-11-04 00:05:13 +01008096 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008097 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008098 errors);
8099 if (ret == -2)
8100 ret = encode_code_page_errors(code_page, &outbytes,
8101 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008102 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008103 if (ret < 0) {
8104 Py_XDECREF(outbytes);
8105 return NULL;
8106 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008107
Victor Stinner7581cef2011-11-03 22:32:33 +01008108 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008109 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008110 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008111
Victor Stinner3a50e702011-10-18 21:21:00 +02008112 return outbytes;
8113}
8114
8115PyObject *
8116PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8117 Py_ssize_t size,
8118 const char *errors)
8119{
Victor Stinner7581cef2011-11-03 22:32:33 +01008120 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008121 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008122 if (unicode == NULL)
8123 return NULL;
8124 res = encode_code_page(CP_ACP, unicode, errors);
8125 Py_DECREF(unicode);
8126 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008127}
8128
8129PyObject *
8130PyUnicode_EncodeCodePage(int code_page,
8131 PyObject *unicode,
8132 const char *errors)
8133{
Victor Stinner7581cef2011-11-03 22:32:33 +01008134 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008135}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008136
Alexander Belopolsky40018472011-02-26 01:02:56 +00008137PyObject *
8138PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008139{
Victor Stinner7581cef2011-11-03 22:32:33 +01008140 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008141}
8142
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008143#undef NEED_RETRY
8144
Steve Dowercc16be82016-09-08 10:35:16 -07008145#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008146
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147/* --- Character Mapping Codec -------------------------------------------- */
8148
Victor Stinnerfb161b12013-04-18 01:44:27 +02008149static int
8150charmap_decode_string(const char *s,
8151 Py_ssize_t size,
8152 PyObject *mapping,
8153 const char *errors,
8154 _PyUnicodeWriter *writer)
8155{
8156 const char *starts = s;
8157 const char *e;
8158 Py_ssize_t startinpos, endinpos;
8159 PyObject *errorHandler = NULL, *exc = NULL;
8160 Py_ssize_t maplen;
8161 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008162 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008163 Py_UCS4 x;
8164 unsigned char ch;
8165
8166 if (PyUnicode_READY(mapping) == -1)
8167 return -1;
8168
8169 maplen = PyUnicode_GET_LENGTH(mapping);
8170 mapdata = PyUnicode_DATA(mapping);
8171 mapkind = PyUnicode_KIND(mapping);
8172
8173 e = s + size;
8174
8175 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8176 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8177 * is disabled in encoding aliases, latin1 is preferred because
8178 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008179 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008180 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8181 Py_UCS4 maxchar = writer->maxchar;
8182
8183 assert (writer->kind == PyUnicode_1BYTE_KIND);
8184 while (s < e) {
8185 ch = *s;
8186 x = mapdata_ucs1[ch];
8187 if (x > maxchar) {
8188 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8189 goto onError;
8190 maxchar = writer->maxchar;
8191 outdata = (Py_UCS1 *)writer->data;
8192 }
8193 outdata[writer->pos] = x;
8194 writer->pos++;
8195 ++s;
8196 }
8197 return 0;
8198 }
8199
8200 while (s < e) {
8201 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8202 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008203 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008204 if (outkind == PyUnicode_1BYTE_KIND) {
8205 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8206 Py_UCS4 maxchar = writer->maxchar;
8207 while (s < e) {
8208 ch = *s;
8209 x = mapdata_ucs2[ch];
8210 if (x > maxchar)
8211 goto Error;
8212 outdata[writer->pos] = x;
8213 writer->pos++;
8214 ++s;
8215 }
8216 break;
8217 }
8218 else if (outkind == PyUnicode_2BYTE_KIND) {
8219 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8220 while (s < e) {
8221 ch = *s;
8222 x = mapdata_ucs2[ch];
8223 if (x == 0xFFFE)
8224 goto Error;
8225 outdata[writer->pos] = x;
8226 writer->pos++;
8227 ++s;
8228 }
8229 break;
8230 }
8231 }
8232 ch = *s;
8233
8234 if (ch < maplen)
8235 x = PyUnicode_READ(mapkind, mapdata, ch);
8236 else
8237 x = 0xfffe; /* invalid value */
8238Error:
8239 if (x == 0xfffe)
8240 {
8241 /* undefined mapping */
8242 startinpos = s-starts;
8243 endinpos = startinpos+1;
8244 if (unicode_decode_call_errorhandler_writer(
8245 errors, &errorHandler,
8246 "charmap", "character maps to <undefined>",
8247 &starts, &e, &startinpos, &endinpos, &exc, &s,
8248 writer)) {
8249 goto onError;
8250 }
8251 continue;
8252 }
8253
8254 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8255 goto onError;
8256 ++s;
8257 }
8258 Py_XDECREF(errorHandler);
8259 Py_XDECREF(exc);
8260 return 0;
8261
8262onError:
8263 Py_XDECREF(errorHandler);
8264 Py_XDECREF(exc);
8265 return -1;
8266}
8267
8268static int
8269charmap_decode_mapping(const char *s,
8270 Py_ssize_t size,
8271 PyObject *mapping,
8272 const char *errors,
8273 _PyUnicodeWriter *writer)
8274{
8275 const char *starts = s;
8276 const char *e;
8277 Py_ssize_t startinpos, endinpos;
8278 PyObject *errorHandler = NULL, *exc = NULL;
8279 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008280 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008281
8282 e = s + size;
8283
8284 while (s < e) {
8285 ch = *s;
8286
8287 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8288 key = PyLong_FromLong((long)ch);
8289 if (key == NULL)
8290 goto onError;
8291
8292 item = PyObject_GetItem(mapping, key);
8293 Py_DECREF(key);
8294 if (item == NULL) {
8295 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8296 /* No mapping found means: mapping is undefined. */
8297 PyErr_Clear();
8298 goto Undefined;
8299 } else
8300 goto onError;
8301 }
8302
8303 /* Apply mapping */
8304 if (item == Py_None)
8305 goto Undefined;
8306 if (PyLong_Check(item)) {
8307 long value = PyLong_AS_LONG(item);
8308 if (value == 0xFFFE)
8309 goto Undefined;
8310 if (value < 0 || value > MAX_UNICODE) {
8311 PyErr_Format(PyExc_TypeError,
Max Bernstein36353882020-10-17 13:38:21 -07008312 "character mapping must be in range(0x%x)",
Victor Stinnerfb161b12013-04-18 01:44:27 +02008313 (unsigned long)MAX_UNICODE + 1);
8314 goto onError;
8315 }
8316
8317 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8318 goto onError;
8319 }
8320 else if (PyUnicode_Check(item)) {
8321 if (PyUnicode_READY(item) == -1)
8322 goto onError;
8323 if (PyUnicode_GET_LENGTH(item) == 1) {
8324 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8325 if (value == 0xFFFE)
8326 goto Undefined;
8327 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8328 goto onError;
8329 }
8330 else {
8331 writer->overallocate = 1;
8332 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8333 goto onError;
8334 }
8335 }
8336 else {
8337 /* wrong return value */
8338 PyErr_SetString(PyExc_TypeError,
8339 "character mapping must return integer, None or str");
8340 goto onError;
8341 }
8342 Py_CLEAR(item);
8343 ++s;
8344 continue;
8345
8346Undefined:
8347 /* undefined mapping */
8348 Py_CLEAR(item);
8349 startinpos = s-starts;
8350 endinpos = startinpos+1;
8351 if (unicode_decode_call_errorhandler_writer(
8352 errors, &errorHandler,
8353 "charmap", "character maps to <undefined>",
8354 &starts, &e, &startinpos, &endinpos, &exc, &s,
8355 writer)) {
8356 goto onError;
8357 }
8358 }
8359 Py_XDECREF(errorHandler);
8360 Py_XDECREF(exc);
8361 return 0;
8362
8363onError:
8364 Py_XDECREF(item);
8365 Py_XDECREF(errorHandler);
8366 Py_XDECREF(exc);
8367 return -1;
8368}
8369
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370PyObject *
8371PyUnicode_DecodeCharmap(const char *s,
8372 Py_ssize_t size,
8373 PyObject *mapping,
8374 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008376 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008377
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 /* Default to Latin-1 */
8379 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008383 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008384 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008385 writer.min_length = size;
8386 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008388
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008389 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008390 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8391 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008392 }
8393 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008394 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8395 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008397 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008398
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008400 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401 return NULL;
8402}
8403
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008404/* Charmap encoding: the lookup table */
8405
Alexander Belopolsky40018472011-02-26 01:02:56 +00008406struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 PyObject_HEAD
8408 unsigned char level1[32];
8409 int count2, count3;
8410 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008411};
8412
8413static PyObject*
8414encoding_map_size(PyObject *obj, PyObject* args)
8415{
8416 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008417 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008419}
8420
8421static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008422 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 PyDoc_STR("Return the size (in bytes) of this object") },
8424 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008425};
8426
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008427static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 "EncodingMap", /*tp_name*/
8430 sizeof(struct encoding_map), /*tp_basicsize*/
8431 0, /*tp_itemsize*/
8432 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008433 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008434 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 0, /*tp_getattr*/
8436 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008437 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 0, /*tp_repr*/
8439 0, /*tp_as_number*/
8440 0, /*tp_as_sequence*/
8441 0, /*tp_as_mapping*/
8442 0, /*tp_hash*/
8443 0, /*tp_call*/
8444 0, /*tp_str*/
8445 0, /*tp_getattro*/
8446 0, /*tp_setattro*/
8447 0, /*tp_as_buffer*/
8448 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8449 0, /*tp_doc*/
8450 0, /*tp_traverse*/
8451 0, /*tp_clear*/
8452 0, /*tp_richcompare*/
8453 0, /*tp_weaklistoffset*/
8454 0, /*tp_iter*/
8455 0, /*tp_iternext*/
8456 encoding_map_methods, /*tp_methods*/
8457 0, /*tp_members*/
8458 0, /*tp_getset*/
8459 0, /*tp_base*/
8460 0, /*tp_dict*/
8461 0, /*tp_descr_get*/
8462 0, /*tp_descr_set*/
8463 0, /*tp_dictoffset*/
8464 0, /*tp_init*/
8465 0, /*tp_alloc*/
8466 0, /*tp_new*/
8467 0, /*tp_free*/
8468 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008469};
8470
8471PyObject*
8472PyUnicode_BuildEncodingMap(PyObject* string)
8473{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008474 PyObject *result;
8475 struct encoding_map *mresult;
8476 int i;
8477 int need_dict = 0;
8478 unsigned char level1[32];
8479 unsigned char level2[512];
8480 unsigned char *mlevel1, *mlevel2, *mlevel3;
8481 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008483 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008484 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008486
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008487 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008488 PyErr_BadArgument();
8489 return NULL;
8490 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 kind = PyUnicode_KIND(string);
8492 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008493 length = PyUnicode_GET_LENGTH(string);
8494 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008495 memset(level1, 0xFF, sizeof level1);
8496 memset(level2, 0xFF, sizeof level2);
8497
8498 /* If there isn't a one-to-one mapping of NULL to \0,
8499 or if there are non-BMP characters, we need to use
8500 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008502 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008503 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008504 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 ch = PyUnicode_READ(kind, data, i);
8506 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008507 need_dict = 1;
8508 break;
8509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008511 /* unmapped character */
8512 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 l1 = ch >> 11;
8514 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008515 if (level1[l1] == 0xFF)
8516 level1[l1] = count2++;
8517 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008518 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008519 }
8520
8521 if (count2 >= 0xFF || count3 >= 0xFF)
8522 need_dict = 1;
8523
8524 if (need_dict) {
8525 PyObject *result = PyDict_New();
8526 PyObject *key, *value;
8527 if (!result)
8528 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008529 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008531 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008532 if (!key || !value)
8533 goto failed1;
8534 if (PyDict_SetItem(result, key, value) == -1)
8535 goto failed1;
8536 Py_DECREF(key);
8537 Py_DECREF(value);
8538 }
8539 return result;
8540 failed1:
8541 Py_XDECREF(key);
8542 Py_XDECREF(value);
8543 Py_DECREF(result);
8544 return NULL;
8545 }
8546
8547 /* Create a three-level trie */
Victor Stinner32bd68c2020-12-01 10:37:39 +01008548 result = PyObject_Malloc(sizeof(struct encoding_map) +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008549 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008550 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008551 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008552 }
8553
8554 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008555 mresult = (struct encoding_map*)result;
8556 mresult->count2 = count2;
8557 mresult->count3 = count3;
8558 mlevel1 = mresult->level1;
8559 mlevel2 = mresult->level23;
8560 mlevel3 = mresult->level23 + 16*count2;
8561 memcpy(mlevel1, level1, 32);
8562 memset(mlevel2, 0xFF, 16*count2);
8563 memset(mlevel3, 0, 128*count3);
8564 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008565 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008566 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008567 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8568 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008569 /* unmapped character */
8570 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008571 o1 = ch>>11;
8572 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008573 i2 = 16*mlevel1[o1] + o2;
8574 if (mlevel2[i2] == 0xFF)
8575 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008576 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008577 i3 = 128*mlevel2[i2] + o3;
8578 mlevel3[i3] = i;
8579 }
8580 return result;
8581}
8582
8583static int
Victor Stinner22168992011-11-20 17:09:18 +01008584encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008585{
8586 struct encoding_map *map = (struct encoding_map*)mapping;
8587 int l1 = c>>11;
8588 int l2 = (c>>7) & 0xF;
8589 int l3 = c & 0x7F;
8590 int i;
8591
Victor Stinner22168992011-11-20 17:09:18 +01008592 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008594 if (c == 0)
8595 return 0;
8596 /* level 1*/
8597 i = map->level1[l1];
8598 if (i == 0xFF) {
8599 return -1;
8600 }
8601 /* level 2*/
8602 i = map->level23[16*i+l2];
8603 if (i == 0xFF) {
8604 return -1;
8605 }
8606 /* level 3 */
8607 i = map->level23[16*map->count2 + 128*i + l3];
8608 if (i == 0) {
8609 return -1;
8610 }
8611 return i;
8612}
8613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008614/* Lookup the character ch in the mapping. If the character
8615 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008616 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008617static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008618charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619{
Christian Heimes217cfd12007-12-02 14:31:20 +00008620 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008621 PyObject *x;
8622
8623 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 x = PyObject_GetItem(mapping, w);
8626 Py_DECREF(w);
8627 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8629 /* No mapping found means: mapping is undefined. */
8630 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008631 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 } else
8633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008635 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008637 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 long value = PyLong_AS_LONG(x);
8639 if (value < 0 || value > 255) {
8640 PyErr_SetString(PyExc_TypeError,
8641 "character mapping must be in range(256)");
8642 Py_DECREF(x);
8643 return NULL;
8644 }
8645 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008647 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 /* wrong return value */
8651 PyErr_Format(PyExc_TypeError,
8652 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008653 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 Py_DECREF(x);
8655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 }
8657}
8658
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008659static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008660charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008661{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008662 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8663 /* exponentially overallocate to minimize reallocations */
8664 if (requiredsize < 2*outsize)
8665 requiredsize = 2*outsize;
8666 if (_PyBytes_Resize(outobj, requiredsize))
8667 return -1;
8668 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008669}
8670
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008675 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 space is available. Return a new reference to the object that
8677 was put in the output buffer, or Py_None, if the mapping was undefined
8678 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008679 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008680static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008681charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008682 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008684 PyObject *rep;
8685 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008686 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687
Andy Lesterdffe4c02020-03-04 07:15:20 -06008688 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008689 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008691 if (res == -1)
8692 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 if (outsize<requiredsize)
8694 if (charmapencode_resize(outobj, outpos, requiredsize))
8695 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008696 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 outstart[(*outpos)++] = (char)res;
8698 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008699 }
8700
8701 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008704 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 Py_DECREF(rep);
8706 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008707 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 if (PyLong_Check(rep)) {
8709 Py_ssize_t requiredsize = *outpos+1;
8710 if (outsize<requiredsize)
8711 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8712 Py_DECREF(rep);
8713 return enc_EXCEPTION;
8714 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008715 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008717 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 else {
8719 const char *repchars = PyBytes_AS_STRING(rep);
8720 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8721 Py_ssize_t requiredsize = *outpos+repsize;
8722 if (outsize<requiredsize)
8723 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8724 Py_DECREF(rep);
8725 return enc_EXCEPTION;
8726 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008727 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 memcpy(outstart + *outpos, repchars, repsize);
8729 *outpos += repsize;
8730 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008731 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008732 Py_DECREF(rep);
8733 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008734}
8735
8736/* handle an error in PyUnicode_EncodeCharmap
8737 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008738static int
8739charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008740 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008742 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008743 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008744{
8745 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008746 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008747 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008748 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008749 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008750 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008751 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008752 Py_ssize_t collstartpos = *inpos;
8753 Py_ssize_t collendpos = *inpos+1;
8754 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008755 const char *encoding = "charmap";
8756 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008757 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008758 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008759 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760
Benjamin Petersonbac79492012-01-14 13:34:47 -05008761 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008762 return -1;
8763 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764 /* find all unencodable characters */
8765 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008766 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008767 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008768 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008769 val = encoding_map_lookup(ch, mapping);
8770 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 break;
8772 ++collendpos;
8773 continue;
8774 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008775
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008776 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8777 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 if (rep==NULL)
8779 return -1;
8780 else if (rep!=Py_None) {
8781 Py_DECREF(rep);
8782 break;
8783 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008784 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008786 }
8787 /* cache callback name lookup
8788 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008789 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008790 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008791
8792 switch (*error_handler) {
8793 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008794 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008795 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008796
8797 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008798 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008799 x = charmapencode_output('?', mapping, res, respos);
8800 if (x==enc_EXCEPTION) {
8801 return -1;
8802 }
8803 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008804 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 return -1;
8806 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008807 }
8808 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008809 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008810 *inpos = collendpos;
8811 break;
Victor Stinner50149202015-09-22 00:26:54 +02008812
8813 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008814 /* generate replacement (temporarily (mis)uses p) */
8815 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 char buffer[2+29+1+1];
8817 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008818 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008819 for (cp = buffer; *cp; ++cp) {
8820 x = charmapencode_output(*cp, mapping, res, respos);
8821 if (x==enc_EXCEPTION)
8822 return -1;
8823 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008824 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 return -1;
8826 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008827 }
8828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008829 *inpos = collendpos;
8830 break;
Victor Stinner50149202015-09-22 00:26:54 +02008831
Benjamin Peterson14339b62009-01-31 16:36:08 +00008832 default:
Victor Stinner50149202015-09-22 00:26:54 +02008833 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008834 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008836 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008838 if (PyBytes_Check(repunicode)) {
8839 /* Directly copy bytes result to output. */
8840 Py_ssize_t outsize = PyBytes_Size(*res);
8841 Py_ssize_t requiredsize;
8842 repsize = PyBytes_Size(repunicode);
8843 requiredsize = *respos + repsize;
8844 if (requiredsize > outsize)
8845 /* Make room for all additional bytes. */
8846 if (charmapencode_resize(res, respos, requiredsize)) {
8847 Py_DECREF(repunicode);
8848 return -1;
8849 }
8850 memcpy(PyBytes_AsString(*res) + *respos,
8851 PyBytes_AsString(repunicode), repsize);
8852 *respos += repsize;
8853 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008854 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008855 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008856 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008857 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008858 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008859 Py_DECREF(repunicode);
8860 return -1;
8861 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008862 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008863 data = PyUnicode_DATA(repunicode);
8864 kind = PyUnicode_KIND(repunicode);
8865 for (index = 0; index < repsize; index++) {
8866 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8867 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008868 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008869 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 return -1;
8871 }
8872 else if (x==enc_FAILED) {
8873 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008874 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 return -1;
8876 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008877 }
8878 *inpos = newpos;
8879 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008880 }
8881 return 0;
8882}
8883
Alexander Belopolsky40018472011-02-26 01:02:56 +00008884PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008885_PyUnicode_EncodeCharmap(PyObject *unicode,
8886 PyObject *mapping,
8887 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008889 /* output object */
8890 PyObject *res = NULL;
8891 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008892 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008893 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008894 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008895 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008896 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008897 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008898 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008899 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008900 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901
Benjamin Petersonbac79492012-01-14 13:34:47 -05008902 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008903 return NULL;
8904 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008905 data = PyUnicode_DATA(unicode);
8906 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008907
Guido van Rossumd57fd912000-03-10 22:53:23 +00008908 /* Default to Latin-1 */
8909 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008910 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008912 /* allocate enough for a simple encoding without
8913 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008914 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008915 if (res == NULL)
8916 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008917 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008920 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008921 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008923 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 if (x==enc_EXCEPTION) /* error */
8925 goto onError;
8926 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008927 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008928 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008929 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 &res, &respos)) {
8931 goto onError;
8932 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008933 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 else
8935 /* done with this character => adjust input position */
8936 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008939 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008940 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008941 if (_PyBytes_Resize(&res, respos) < 0)
8942 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008943
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008944 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008945 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008946 return res;
8947
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008949 Py_XDECREF(res);
8950 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008951 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952 return NULL;
8953}
8954
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008955/* Deprecated */
8956PyObject *
8957PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8958 Py_ssize_t size,
8959 PyObject *mapping,
8960 const char *errors)
8961{
8962 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008963 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008964 if (unicode == NULL)
8965 return NULL;
8966 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8967 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008968 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008969}
8970
Alexander Belopolsky40018472011-02-26 01:02:56 +00008971PyObject *
8972PyUnicode_AsCharmapString(PyObject *unicode,
8973 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974{
8975 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 PyErr_BadArgument();
8977 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008979 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980}
8981
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008982/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008983static void
8984make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008986 Py_ssize_t startpos, Py_ssize_t endpos,
8987 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008989 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 *exceptionObject = _PyUnicodeTranslateError_Create(
8991 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992 }
8993 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8995 goto onError;
8996 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8997 goto onError;
8998 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8999 goto onError;
9000 return;
9001 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02009002 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003 }
9004}
9005
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009006/* error handling callback helper:
9007 build arguments, call the callback and check the arguments,
9008 put the result into newpos and return the replacement string, which
9009 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009010static PyObject *
9011unicode_translate_call_errorhandler(const char *errors,
9012 PyObject **errorHandler,
9013 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009015 Py_ssize_t startpos, Py_ssize_t endpos,
9016 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009017{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009018 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009019
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009020 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009021 PyObject *restuple;
9022 PyObject *resunicode;
9023
9024 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009026 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009028 }
9029
9030 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009032 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009033 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009034
Petr Viktorinffd97532020-02-11 17:46:57 +01009035 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009036 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009037 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009038 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009039 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 Py_DECREF(restuple);
9041 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009042 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009043 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 &resunicode, &i_newpos)) {
9045 Py_DECREF(restuple);
9046 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009047 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009048 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009050 else
9051 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009053 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009054 Py_DECREF(restuple);
9055 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009056 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009057 Py_INCREF(resunicode);
9058 Py_DECREF(restuple);
9059 return resunicode;
9060}
9061
9062/* Lookup the character ch in the mapping and put the result in result,
9063 which must be decrefed by the caller.
9064 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009065static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009067{
Christian Heimes217cfd12007-12-02 14:31:20 +00009068 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009069 PyObject *x;
9070
9071 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009073 x = PyObject_GetItem(mapping, w);
9074 Py_DECREF(w);
9075 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9077 /* No mapping found means: use 1:1 mapping. */
9078 PyErr_Clear();
9079 *result = NULL;
9080 return 0;
9081 } else
9082 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009083 }
9084 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 *result = x;
9086 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009087 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009088 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009090 if (value < 0 || value > MAX_UNICODE) {
9091 PyErr_Format(PyExc_ValueError,
9092 "character mapping must be in range(0x%x)",
9093 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 Py_DECREF(x);
9095 return -1;
9096 }
9097 *result = x;
9098 return 0;
9099 }
9100 else if (PyUnicode_Check(x)) {
9101 *result = x;
9102 return 0;
9103 }
9104 else {
9105 /* wrong return value */
9106 PyErr_SetString(PyExc_TypeError,
9107 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009108 Py_DECREF(x);
9109 return -1;
9110 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009111}
Victor Stinner1194ea02014-04-04 19:37:40 +02009112
9113/* lookup the character, write the result into the writer.
9114 Return 1 if the result was written into the writer, return 0 if the mapping
9115 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009116static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009117charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9118 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009119{
Victor Stinner1194ea02014-04-04 19:37:40 +02009120 PyObject *item;
9121
9122 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009124
9125 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009127 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009130 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009131 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009132
9133 if (item == Py_None) {
9134 Py_DECREF(item);
9135 return 0;
9136 }
9137
9138 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009139 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9140 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9141 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009142 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9143 Py_DECREF(item);
9144 return -1;
9145 }
9146 Py_DECREF(item);
9147 return 1;
9148 }
9149
9150 if (!PyUnicode_Check(item)) {
9151 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009153 }
9154
9155 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9156 Py_DECREF(item);
9157 return -1;
9158 }
9159
9160 Py_DECREF(item);
9161 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009162}
9163
Victor Stinner89a76ab2014-04-05 11:44:04 +02009164static int
9165unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9166 Py_UCS1 *translate)
9167{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009168 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009169 int ret = 0;
9170
Victor Stinner89a76ab2014-04-05 11:44:04 +02009171 if (charmaptranslate_lookup(ch, mapping, &item)) {
9172 return -1;
9173 }
9174
9175 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009176 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009177 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009178 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009179 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009180 /* not found => default to 1:1 mapping */
9181 translate[ch] = ch;
9182 return 1;
9183 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009184 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009185 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009186 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9187 used it */
9188 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009189 /* invalid character or character outside ASCII:
9190 skip the fast translate */
9191 goto exit;
9192 }
9193 translate[ch] = (Py_UCS1)replace;
9194 }
9195 else if (PyUnicode_Check(item)) {
9196 Py_UCS4 replace;
9197
9198 if (PyUnicode_READY(item) == -1) {
9199 Py_DECREF(item);
9200 return -1;
9201 }
9202 if (PyUnicode_GET_LENGTH(item) != 1)
9203 goto exit;
9204
9205 replace = PyUnicode_READ_CHAR(item, 0);
9206 if (replace > 127)
9207 goto exit;
9208 translate[ch] = (Py_UCS1)replace;
9209 }
9210 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009211 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009212 goto exit;
9213 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009214 ret = 1;
9215
Benjamin Peterson1365de72014-04-07 20:15:41 -04009216 exit:
9217 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009218 return ret;
9219}
9220
9221/* Fast path for ascii => ascii translation. Return 1 if the whole string
9222 was translated into writer, return 0 if the input string was partially
9223 translated into writer, raise an exception and return -1 on error. */
9224static int
9225unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009226 _PyUnicodeWriter *writer, int ignore,
9227 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009228{
Victor Stinner872b2912014-04-05 14:27:07 +02009229 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009230 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009231 const Py_UCS1 *in, *end;
9232 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009233 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009234
Victor Stinner89a76ab2014-04-05 11:44:04 +02009235 len = PyUnicode_GET_LENGTH(input);
9236
Victor Stinner872b2912014-04-05 14:27:07 +02009237 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009238
9239 in = PyUnicode_1BYTE_DATA(input);
9240 end = in + len;
9241
9242 assert(PyUnicode_IS_ASCII(writer->buffer));
9243 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9244 out = PyUnicode_1BYTE_DATA(writer->buffer);
9245
Victor Stinner872b2912014-04-05 14:27:07 +02009246 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009247 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009248 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009249 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009250 int translate = unicode_fast_translate_lookup(mapping, ch,
9251 ascii_table);
9252 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009253 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009254 if (translate == 0)
9255 goto exit;
9256 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009257 }
Victor Stinner872b2912014-04-05 14:27:07 +02009258 if (ch2 == 0xfe) {
9259 if (ignore)
9260 continue;
9261 goto exit;
9262 }
9263 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009264 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009265 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009266 }
Victor Stinner872b2912014-04-05 14:27:07 +02009267 res = 1;
9268
9269exit:
9270 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009271 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009272 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009273}
9274
Victor Stinner3222da22015-10-01 22:07:32 +02009275static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276_PyUnicode_TranslateCharmap(PyObject *input,
9277 PyObject *mapping,
9278 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009281 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 Py_ssize_t size, i;
9283 int kind;
9284 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009285 _PyUnicodeWriter writer;
9286 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009287 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009288 PyObject *errorHandler = NULL;
9289 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009290 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009291 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009292
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009294 PyErr_BadArgument();
9295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 if (PyUnicode_READY(input) == -1)
9299 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009300 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 kind = PyUnicode_KIND(input);
9302 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009304 if (size == 0)
9305 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009307 /* allocate enough for a simple 1:1 translation without
9308 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009309 _PyUnicodeWriter_Init(&writer);
9310 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009311 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312
Victor Stinner872b2912014-04-05 14:27:07 +02009313 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9314
Victor Stinner33798672016-03-01 21:59:58 +01009315 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009316 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009317 if (PyUnicode_IS_ASCII(input)) {
9318 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9319 if (res < 0) {
9320 _PyUnicodeWriter_Dealloc(&writer);
9321 return NULL;
9322 }
9323 if (res == 1)
9324 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009325 }
Victor Stinner33798672016-03-01 21:59:58 +01009326 else {
9327 i = 0;
9328 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009332 int translate;
9333 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9334 Py_ssize_t newpos;
9335 /* startpos for collecting untranslatable chars */
9336 Py_ssize_t collstart;
9337 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009338 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339
Victor Stinner1194ea02014-04-04 19:37:40 +02009340 ch = PyUnicode_READ(kind, data, i);
9341 translate = charmaptranslate_output(ch, mapping, &writer);
9342 if (translate < 0)
9343 goto onError;
9344
9345 if (translate != 0) {
9346 /* it worked => adjust input pointer */
9347 ++i;
9348 continue;
9349 }
9350
9351 /* untranslatable character */
9352 collstart = i;
9353 collend = i+1;
9354
9355 /* find all untranslatable characters */
9356 while (collend < size) {
9357 PyObject *x;
9358 ch = PyUnicode_READ(kind, data, collend);
9359 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009360 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009361 Py_XDECREF(x);
9362 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009363 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009364 ++collend;
9365 }
9366
9367 if (ignore) {
9368 i = collend;
9369 }
9370 else {
9371 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9372 reason, input, &exc,
9373 collstart, collend, &newpos);
9374 if (repunicode == NULL)
9375 goto onError;
9376 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009377 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009378 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009379 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009380 Py_DECREF(repunicode);
9381 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009382 }
9383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009384 Py_XDECREF(exc);
9385 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009386 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009389 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009390 Py_XDECREF(exc);
9391 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 return NULL;
9393}
9394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395/* Deprecated. Use PyUnicode_Translate instead. */
9396PyObject *
9397PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9398 Py_ssize_t size,
9399 PyObject *mapping,
9400 const char *errors)
9401{
Christian Heimes5f520f42012-09-11 14:03:25 +02009402 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009403 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 if (!unicode)
9405 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009406 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9407 Py_DECREF(unicode);
9408 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409}
9410
Alexander Belopolsky40018472011-02-26 01:02:56 +00009411PyObject *
9412PyUnicode_Translate(PyObject *str,
9413 PyObject *mapping,
9414 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009416 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009417 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009418 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419}
Tim Petersced69f82003-09-16 20:30:58 +00009420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421PyObject *
9422_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9423{
9424 if (!PyUnicode_Check(unicode)) {
9425 PyErr_BadInternalCall();
9426 return NULL;
9427 }
9428 if (PyUnicode_READY(unicode) == -1)
9429 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009430 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 /* If the string is already ASCII, just return the same string */
9432 Py_INCREF(unicode);
9433 return unicode;
9434 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009435
9436 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9437 PyObject *result = PyUnicode_New(len, 127);
9438 if (result == NULL) {
9439 return NULL;
9440 }
9441
9442 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9443 int kind = PyUnicode_KIND(unicode);
9444 const void *data = PyUnicode_DATA(unicode);
9445 Py_ssize_t i;
9446 for (i = 0; i < len; ++i) {
9447 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9448 if (ch < 127) {
9449 out[i] = ch;
9450 }
9451 else if (Py_UNICODE_ISSPACE(ch)) {
9452 out[i] = ' ';
9453 }
9454 else {
9455 int decimal = Py_UNICODE_TODECIMAL(ch);
9456 if (decimal < 0) {
9457 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009458 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009459 _PyUnicode_LENGTH(result) = i + 1;
9460 break;
9461 }
9462 out[i] = '0' + decimal;
9463 }
9464 }
9465
INADA Naoki16dfca42018-07-14 12:06:43 +09009466 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009467 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468}
9469
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009470PyObject *
9471PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9472 Py_ssize_t length)
9473{
Victor Stinnerf0124502011-11-21 23:12:56 +01009474 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009475 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009476 Py_UCS4 maxchar;
9477 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009478 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009479
Victor Stinner99d7ad02012-02-22 13:37:39 +01009480 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009481 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009482 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009483 if (ch > 127) {
9484 int decimal = Py_UNICODE_TODECIMAL(ch);
9485 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009486 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009487 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009488 }
9489 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009490
9491 /* Copy to a new string */
9492 decimal = PyUnicode_New(length, maxchar);
9493 if (decimal == NULL)
9494 return decimal;
9495 kind = PyUnicode_KIND(decimal);
9496 data = PyUnicode_DATA(decimal);
9497 /* Iterate over code points */
9498 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009499 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009500 if (ch > 127) {
9501 int decimal = Py_UNICODE_TODECIMAL(ch);
9502 if (decimal >= 0)
9503 ch = '0' + decimal;
9504 }
9505 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009507 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009508}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009509/* --- Decimal Encoder ---------------------------------------------------- */
9510
Alexander Belopolsky40018472011-02-26 01:02:56 +00009511int
9512PyUnicode_EncodeDecimal(Py_UNICODE *s,
9513 Py_ssize_t length,
9514 char *output,
9515 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009516{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009517 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009518 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009519 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009520 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009521
9522 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009523 PyErr_BadArgument();
9524 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009525 }
9526
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009527 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009528 if (unicode == NULL)
9529 return -1;
9530
Victor Stinner42bf7752011-11-21 22:52:58 +01009531 kind = PyUnicode_KIND(unicode);
9532 data = PyUnicode_DATA(unicode);
9533
Victor Stinnerb84d7232011-11-22 01:50:07 +01009534 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009535 PyObject *exc;
9536 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009538 Py_ssize_t startpos;
9539
9540 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009541
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009543 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009544 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009545 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 decimal = Py_UNICODE_TODECIMAL(ch);
9548 if (decimal >= 0) {
9549 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009550 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 continue;
9552 }
9553 if (0 < ch && ch < 256) {
9554 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009555 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 continue;
9557 }
Victor Stinner6345be92011-11-25 20:09:01 +01009558
Victor Stinner42bf7752011-11-21 22:52:58 +01009559 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009560 exc = NULL;
9561 raise_encode_exception(&exc, "decimal", unicode,
9562 startpos, startpos+1,
9563 "invalid decimal Unicode string");
9564 Py_XDECREF(exc);
9565 Py_DECREF(unicode);
9566 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009567 }
9568 /* 0-terminate the output string */
9569 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009570 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009571 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009572}
9573
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574/* --- Helpers ------------------------------------------------------------ */
9575
/* helper macro to fixup start/end slice values

   Clamps `end` to at most `len`, interprets negative `start`/`end` as
   offsets from `len`, and floors both at 0.  `start` and `end` must be
   modifiable lvalues; they are updated in place.  `len` may be any
   expression (it is parenthesized at every use) but is evaluated more than
   once, so it should not have side effects.

   The expansion is a pair of plain `if` statements, not a single
   statement: do not use it as the unbraced body of an if/else or loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len)) {                        \
        end = (len);                            \
    }                                           \
    else if ((end) < 0) {                       \
        end += (len);                           \
        if ((end) < 0) {                        \
            end = 0;                            \
        }                                       \
    }                                           \
    if ((start) < 0) {                          \
        start += (len);                         \
        if ((start) < 0) {                      \
            start = 0;                          \
        }                                       \
    }
9590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009592any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009594 Py_ssize_t end,
9595 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009597 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009598 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 Py_ssize_t len1, len2, result;
9600
9601 kind1 = PyUnicode_KIND(s1);
9602 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009603 if (kind1 < kind2)
9604 return -1;
9605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 len1 = PyUnicode_GET_LENGTH(s1);
9607 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009608 ADJUST_INDICES(start, end, len1);
9609 if (end - start < len2)
9610 return -1;
9611
9612 buf1 = PyUnicode_DATA(s1);
9613 buf2 = PyUnicode_DATA(s2);
9614 if (len2 == 1) {
9615 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9616 result = findchar((const char *)buf1 + kind1*start,
9617 kind1, end - start, ch, direction);
9618 if (result == -1)
9619 return -1;
9620 else
9621 return start + result;
9622 }
9623
9624 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009625 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009626 if (!buf2)
9627 return -2;
9628 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629
Victor Stinner794d5672011-10-10 03:21:36 +02009630 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009631 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009632 case PyUnicode_1BYTE_KIND:
9633 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9634 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9635 else
9636 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9637 break;
9638 case PyUnicode_2BYTE_KIND:
9639 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9640 break;
9641 case PyUnicode_4BYTE_KIND:
9642 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9643 break;
9644 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009645 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009646 }
9647 }
9648 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009649 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009650 case PyUnicode_1BYTE_KIND:
9651 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9652 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9653 else
9654 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9655 break;
9656 case PyUnicode_2BYTE_KIND:
9657 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9658 break;
9659 case PyUnicode_4BYTE_KIND:
9660 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9661 break;
9662 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009663 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 }
9666
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009667 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009668 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009669 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670
9671 return result;
9672}
9673
Victor Stinner59423e32018-11-26 13:40:01 +01009674/* _PyUnicode_InsertThousandsGrouping() helper functions */
9675#include "stringlib/localeutil.h"
9676
9677/**
9678 * InsertThousandsGrouping:
9679 * @writer: Unicode writer.
9680 * @n_buffer: Number of characters in @buffer.
9681 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9682 * @d_pos: Start of digits string.
9683 * @n_digits: The number of digits in the string, in which we want
9684 * to put the grouping chars.
9685 * @min_width: The minimum width of the digits in the output string.
9686 * Output will be zero-padded on the left to fill.
9687 * @grouping: see definition in localeconv().
9688 * @thousands_sep: see definition in localeconv().
9689 *
9690 * There are 2 modes: counting and filling. If @writer is NULL,
9691 * we are in counting mode, else filling mode.
9692 * If counting, the required buffer size is returned.
9693 * If filling, we know the buffer will be large enough, so we don't
9694 * need to pass in the buffer size.
9695 * Inserts thousand grouping characters (as defined by grouping and
9696 * thousands_sep) into @writer.
9697 *
9698 * Return value: -1 on error, number of characters otherwise.
9699 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009701_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009702 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009703 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009704 PyObject *digits,
9705 Py_ssize_t d_pos,
9706 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009707 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009708 const char *grouping,
9709 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009710 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711{
Xtreak3f7983a2019-01-07 20:39:14 +05309712 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009713 if (writer) {
9714 assert(digits != NULL);
9715 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009716 }
9717 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009718 assert(digits == NULL);
9719 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009720 }
Victor Stinner59423e32018-11-26 13:40:01 +01009721 assert(0 <= d_pos);
9722 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009723 assert(grouping != NULL);
9724
9725 if (digits != NULL) {
9726 if (PyUnicode_READY(digits) == -1) {
9727 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009728 }
Victor Stinner59423e32018-11-26 13:40:01 +01009729 }
9730 if (PyUnicode_READY(thousands_sep) == -1) {
9731 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009732 }
9733
Victor Stinner59423e32018-11-26 13:40:01 +01009734 Py_ssize_t count = 0;
9735 Py_ssize_t n_zeros;
9736 int loop_broken = 0;
9737 int use_separator = 0; /* First time through, don't append the
9738 separator. They only go between
9739 groups. */
9740 Py_ssize_t buffer_pos;
9741 Py_ssize_t digits_pos;
9742 Py_ssize_t len;
9743 Py_ssize_t n_chars;
9744 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9745 be looked at */
9746 /* A generator that returns all of the grouping widths, until it
9747 returns 0. */
9748 GroupGenerator groupgen;
9749 GroupGenerator_init(&groupgen, grouping);
9750 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9751
9752 /* if digits are not grouped, thousands separator
9753 should be an empty string */
9754 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9755
9756 digits_pos = d_pos + n_digits;
9757 if (writer) {
9758 buffer_pos = writer->pos + n_buffer;
9759 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9760 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 }
Victor Stinner59423e32018-11-26 13:40:01 +01009762 else {
9763 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009764 }
Victor Stinner59423e32018-11-26 13:40:01 +01009765
9766 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009767 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009768 }
Victor Stinner59423e32018-11-26 13:40:01 +01009769
9770 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9771 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9772 n_zeros = Py_MAX(0, len - remaining);
9773 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9774
9775 /* Use n_zero zero's and n_chars chars */
9776
9777 /* Count only, don't do anything. */
9778 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9779
9780 /* Copy into the writer. */
9781 InsertThousandsGrouping_fill(writer, &buffer_pos,
9782 digits, &digits_pos,
9783 n_chars, n_zeros,
9784 use_separator ? thousands_sep : NULL,
9785 thousands_sep_len, maxchar);
9786
9787 /* Use a separator next time. */
9788 use_separator = 1;
9789
9790 remaining -= n_chars;
9791 min_width -= len;
9792
9793 if (remaining <= 0 && min_width <= 0) {
9794 loop_broken = 1;
9795 break;
9796 }
9797 min_width -= thousands_sep_len;
9798 }
9799 if (!loop_broken) {
9800 /* We left the loop without using a break statement. */
9801
9802 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9803 n_zeros = Py_MAX(0, len - remaining);
9804 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9805
9806 /* Use n_zero zero's and n_chars chars */
9807 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9808
9809 /* Copy into the writer. */
9810 InsertThousandsGrouping_fill(writer, &buffer_pos,
9811 digits, &digits_pos,
9812 n_chars, n_zeros,
9813 use_separator ? thousands_sep : NULL,
9814 thousands_sep_len, maxchar);
9815 }
9816 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817}
9818
9819
Alexander Belopolsky40018472011-02-26 01:02:56 +00009820Py_ssize_t
9821PyUnicode_Count(PyObject *str,
9822 PyObject *substr,
9823 Py_ssize_t start,
9824 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009825{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009826 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009827 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009828 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009830
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009831 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009832 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009833
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009834 kind1 = PyUnicode_KIND(str);
9835 kind2 = PyUnicode_KIND(substr);
9836 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009837 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009838
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009839 len1 = PyUnicode_GET_LENGTH(str);
9840 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009842 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009843 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009844
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009845 buf1 = PyUnicode_DATA(str);
9846 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009847 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009848 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009849 if (!buf2)
9850 goto onError;
9851 }
9852
9853 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009855 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009856 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009857 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009858 buf2, len2, PY_SSIZE_T_MAX
9859 );
9860 else
9861 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009862 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009863 buf2, len2, PY_SSIZE_T_MAX
9864 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 break;
9866 case PyUnicode_2BYTE_KIND:
9867 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009868 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 buf2, len2, PY_SSIZE_T_MAX
9870 );
9871 break;
9872 case PyUnicode_4BYTE_KIND:
9873 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009874 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 buf2, len2, PY_SSIZE_T_MAX
9876 );
9877 break;
9878 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009879 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009881
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009882 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009883 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009884 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009888 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9889 if (kind2 != kind1)
9890 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892}
9893
Alexander Belopolsky40018472011-02-26 01:02:56 +00009894Py_ssize_t
9895PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009896 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009897 Py_ssize_t start,
9898 Py_ssize_t end,
9899 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009901 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009902 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009903
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009904 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905}
9906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907Py_ssize_t
9908PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9909 Py_ssize_t start, Py_ssize_t end,
9910 int direction)
9911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009913 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 if (PyUnicode_READY(str) == -1)
9915 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009916 len = PyUnicode_GET_LENGTH(str);
9917 ADJUST_INDICES(start, end, len);
9918 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009919 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009921 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9922 kind, end-start, ch, direction);
9923 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009925 else
9926 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927}
9928
Alexander Belopolsky40018472011-02-26 01:02:56 +00009929static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009930tailmatch(PyObject *self,
9931 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009932 Py_ssize_t start,
9933 Py_ssize_t end,
9934 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 int kind_self;
9937 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009938 const void *data_self;
9939 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 Py_ssize_t offset;
9941 Py_ssize_t i;
9942 Py_ssize_t end_sub;
9943
9944 if (PyUnicode_READY(self) == -1 ||
9945 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009946 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9949 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009951 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009953 if (PyUnicode_GET_LENGTH(substring) == 0)
9954 return 1;
9955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 kind_self = PyUnicode_KIND(self);
9957 data_self = PyUnicode_DATA(self);
9958 kind_sub = PyUnicode_KIND(substring);
9959 data_sub = PyUnicode_DATA(substring);
9960 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9961
9962 if (direction > 0)
9963 offset = end;
9964 else
9965 offset = start;
9966
9967 if (PyUnicode_READ(kind_self, data_self, offset) ==
9968 PyUnicode_READ(kind_sub, data_sub, 0) &&
9969 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9970 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9971 /* If both are of the same kind, memcmp is sufficient */
9972 if (kind_self == kind_sub) {
9973 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009974 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 data_sub,
9976 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009977 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009979 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 else {
9981 /* We do not need to compare 0 and len(substring)-1 because
9982 the if statement above ensured already that they are equal
9983 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009984 for (i = 1; i < end_sub; ++i) {
9985 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9986 PyUnicode_READ(kind_sub, data_sub, i))
9987 return 0;
9988 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991 }
9992
9993 return 0;
9994}
9995
Alexander Belopolsky40018472011-02-26 01:02:56 +00009996Py_ssize_t
9997PyUnicode_Tailmatch(PyObject *str,
9998 PyObject *substr,
9999 Py_ssize_t start,
10000 Py_ssize_t end,
10001 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010003 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010004 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010005
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010006 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007}
10008
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010009static PyObject *
10010ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010012 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010013 const char *data = PyUnicode_DATA(self);
10014 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010015 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010016
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010017 res = PyUnicode_New(len, 127);
10018 if (res == NULL)
10019 return NULL;
10020 resdata = PyUnicode_DATA(res);
10021 if (lower)
10022 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010024 _Py_bytes_upper(resdata, data, len);
10025 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026}
10027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010029handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010031 Py_ssize_t j;
10032 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010033 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010034 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010035
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010036 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10037
10038 where ! is a negation and \p{xxx} is a character with property xxx.
10039 */
10040 for (j = i - 1; j >= 0; j--) {
10041 c = PyUnicode_READ(kind, data, j);
10042 if (!_PyUnicode_IsCaseIgnorable(c))
10043 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010045 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10046 if (final_sigma) {
10047 for (j = i + 1; j < length; j++) {
10048 c = PyUnicode_READ(kind, data, j);
10049 if (!_PyUnicode_IsCaseIgnorable(c))
10050 break;
10051 }
10052 final_sigma = j == length || !_PyUnicode_IsCased(c);
10053 }
10054 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055}
10056
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010057static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010058lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010059 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010061 /* Obscure special case. */
10062 if (c == 0x3A3) {
10063 mapped[0] = handle_capital_sigma(kind, data, length, i);
10064 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010066 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067}
10068
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010069static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010070do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010071{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010072 Py_ssize_t i, k = 0;
10073 int n_res, j;
10074 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010075
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010076 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010077 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010078 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010079 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010080 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010082 for (i = 1; i < length; i++) {
10083 c = PyUnicode_READ(kind, data, i);
10084 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10085 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010086 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010087 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010088 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010089 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010090 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091}
10092
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010093static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010094do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010095 Py_ssize_t i, k = 0;
10096
10097 for (i = 0; i < length; i++) {
10098 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10099 int n_res, j;
10100 if (Py_UNICODE_ISUPPER(c)) {
10101 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10102 }
10103 else if (Py_UNICODE_ISLOWER(c)) {
10104 n_res = _PyUnicode_ToUpperFull(c, mapped);
10105 }
10106 else {
10107 n_res = 1;
10108 mapped[0] = c;
10109 }
10110 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010111 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010112 res[k++] = mapped[j];
10113 }
10114 }
10115 return k;
10116}
10117
10118static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010119do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010120 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010122 Py_ssize_t i, k = 0;
10123
10124 for (i = 0; i < length; i++) {
10125 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10126 int n_res, j;
10127 if (lower)
10128 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10129 else
10130 n_res = _PyUnicode_ToUpperFull(c, mapped);
10131 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010132 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010133 res[k++] = mapped[j];
10134 }
10135 }
10136 return k;
10137}
10138
10139static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010140do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010141{
10142 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10143}
10144
10145static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010146do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010147{
10148 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10149}
10150
Benjamin Petersone51757f2012-01-12 21:10:29 -050010151static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010152do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010153{
10154 Py_ssize_t i, k = 0;
10155
10156 for (i = 0; i < length; i++) {
10157 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10158 Py_UCS4 mapped[3];
10159 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10160 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010161 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010162 res[k++] = mapped[j];
10163 }
10164 }
10165 return k;
10166}
10167
10168static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010169do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010170{
10171 Py_ssize_t i, k = 0;
10172 int previous_is_cased;
10173
10174 previous_is_cased = 0;
10175 for (i = 0; i < length; i++) {
10176 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10177 Py_UCS4 mapped[3];
10178 int n_res, j;
10179
10180 if (previous_is_cased)
10181 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10182 else
10183 n_res = _PyUnicode_ToTitleFull(c, mapped);
10184
10185 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010186 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010187 res[k++] = mapped[j];
10188 }
10189
10190 previous_is_cased = _PyUnicode_IsCased(c);
10191 }
10192 return k;
10193}
10194
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010195static PyObject *
10196case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010197 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010198{
10199 PyObject *res = NULL;
10200 Py_ssize_t length, newlength = 0;
10201 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010202 const void *data;
10203 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010204 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10205
Benjamin Petersoneea48462012-01-16 14:28:50 -050010206 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010207
10208 kind = PyUnicode_KIND(self);
10209 data = PyUnicode_DATA(self);
10210 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010211 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010212 PyErr_SetString(PyExc_OverflowError, "string is too long");
10213 return NULL;
10214 }
Victor Stinner00d7abd2020-12-01 09:56:42 +010010215 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010216 if (tmp == NULL)
10217 return PyErr_NoMemory();
10218 newlength = perform(kind, data, length, tmp, &maxchar);
10219 res = PyUnicode_New(newlength, maxchar);
10220 if (res == NULL)
10221 goto leave;
10222 tmpend = tmp + newlength;
10223 outdata = PyUnicode_DATA(res);
10224 outkind = PyUnicode_KIND(res);
10225 switch (outkind) {
10226 case PyUnicode_1BYTE_KIND:
10227 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10228 break;
10229 case PyUnicode_2BYTE_KIND:
10230 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10231 break;
10232 case PyUnicode_4BYTE_KIND:
10233 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10234 break;
10235 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010236 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010237 }
10238 leave:
Victor Stinner00d7abd2020-12-01 09:56:42 +010010239 PyMem_Free(tmp);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010240 return res;
10241}
10242
Tim Peters8ce9f162004-08-27 01:49:32 +000010243PyObject *
10244PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010246 PyObject *res;
10247 PyObject *fseq;
10248 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010249 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010251 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010252 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010253 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010254 }
10255
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010256 /* NOTE: the following code can't call back into Python code,
10257 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010258 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010259
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010260 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010261 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010262 res = _PyUnicode_JoinArray(separator, items, seqlen);
10263 Py_DECREF(fseq);
10264 return res;
10265}
10266
10267PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010268_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010269{
10270 PyObject *res = NULL; /* the result */
10271 PyObject *sep = NULL;
10272 Py_ssize_t seplen;
10273 PyObject *item;
10274 Py_ssize_t sz, i, res_offset;
10275 Py_UCS4 maxchar;
10276 Py_UCS4 item_maxchar;
10277 int use_memcpy;
10278 unsigned char *res_data = NULL, *sep_data = NULL;
10279 PyObject *last_obj;
10280 unsigned int kind = 0;
10281
Tim Peters05eba1f2004-08-27 21:32:02 +000010282 /* If empty sequence, return u"". */
10283 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010284 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010285 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010286
Tim Peters05eba1f2004-08-27 21:32:02 +000010287 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010288 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010289 if (seqlen == 1) {
10290 if (PyUnicode_CheckExact(items[0])) {
10291 res = items[0];
10292 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010293 return res;
10294 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010295 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010296 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010297 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010298 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010299 /* Set up sep and seplen */
10300 if (separator == NULL) {
10301 /* fall back to a blank space separator */
10302 sep = PyUnicode_FromOrdinal(' ');
10303 if (!sep)
10304 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010305 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010306 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010307 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010308 else {
10309 if (!PyUnicode_Check(separator)) {
10310 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010311 "separator: expected str instance,"
10312 " %.80s found",
10313 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010314 goto onError;
10315 }
10316 if (PyUnicode_READY(separator))
10317 goto onError;
10318 sep = separator;
10319 seplen = PyUnicode_GET_LENGTH(separator);
10320 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10321 /* inc refcount to keep this code path symmetric with the
10322 above case of a blank separator */
10323 Py_INCREF(sep);
10324 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010325 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010326 }
10327
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010328 /* There are at least two things to join, or else we have a subclass
10329 * of str in the sequence.
10330 * Do a pre-pass to figure out the total amount of space we'll
10331 * need (sz), and see whether all argument are strings.
10332 */
10333 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010334#ifdef Py_DEBUG
10335 use_memcpy = 0;
10336#else
10337 use_memcpy = 1;
10338#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010339 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010340 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010341 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010342 if (!PyUnicode_Check(item)) {
10343 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010344 "sequence item %zd: expected str instance,"
10345 " %.80s found",
10346 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010347 goto onError;
10348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 if (PyUnicode_READY(item) == -1)
10350 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010351 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010353 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010354 if (i != 0) {
10355 add_sz += seplen;
10356 }
10357 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010358 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010359 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010360 goto onError;
10361 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010362 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010363 if (use_memcpy && last_obj != NULL) {
10364 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10365 use_memcpy = 0;
10366 }
10367 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010368 }
Tim Petersced69f82003-09-16 20:30:58 +000010369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010371 if (res == NULL)
10372 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010373
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010374 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010375#ifdef Py_DEBUG
10376 use_memcpy = 0;
10377#else
10378 if (use_memcpy) {
10379 res_data = PyUnicode_1BYTE_DATA(res);
10380 kind = PyUnicode_KIND(res);
10381 if (seplen != 0)
10382 sep_data = PyUnicode_1BYTE_DATA(sep);
10383 }
10384#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010385 if (use_memcpy) {
10386 for (i = 0; i < seqlen; ++i) {
10387 Py_ssize_t itemlen;
10388 item = items[i];
10389
10390 /* Copy item, and maybe the separator. */
10391 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010392 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010393 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010394 kind * seplen);
10395 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010396 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010397
10398 itemlen = PyUnicode_GET_LENGTH(item);
10399 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010400 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010401 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010402 kind * itemlen);
10403 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010404 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010405 }
10406 assert(res_data == PyUnicode_1BYTE_DATA(res)
10407 + kind * PyUnicode_GET_LENGTH(res));
10408 }
10409 else {
10410 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10411 Py_ssize_t itemlen;
10412 item = items[i];
10413
10414 /* Copy item, and maybe the separator. */
10415 if (i && seplen != 0) {
10416 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10417 res_offset += seplen;
10418 }
10419
10420 itemlen = PyUnicode_GET_LENGTH(item);
10421 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010422 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010423 res_offset += itemlen;
10424 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010425 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010426 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010427 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010430 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432
Benjamin Peterson29060642009-01-31 22:14:21 +000010433 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010435 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436 return NULL;
10437}
10438
Victor Stinnerd3f08822012-05-29 12:57:52 +020010439void
10440_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10441 Py_UCS4 fill_char)
10442{
10443 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010444 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010445 assert(PyUnicode_IS_READY(unicode));
10446 assert(unicode_modifiable(unicode));
10447 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10448 assert(start >= 0);
10449 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010450 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010451}
10452
Victor Stinner3fe55312012-01-04 00:33:50 +010010453Py_ssize_t
10454PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10455 Py_UCS4 fill_char)
10456{
10457 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010458
10459 if (!PyUnicode_Check(unicode)) {
10460 PyErr_BadInternalCall();
10461 return -1;
10462 }
10463 if (PyUnicode_READY(unicode) == -1)
10464 return -1;
10465 if (unicode_check_modifiable(unicode))
10466 return -1;
10467
Victor Stinnerd3f08822012-05-29 12:57:52 +020010468 if (start < 0) {
10469 PyErr_SetString(PyExc_IndexError, "string index out of range");
10470 return -1;
10471 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010472 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10473 PyErr_SetString(PyExc_ValueError,
10474 "fill character is bigger than "
10475 "the string maximum character");
10476 return -1;
10477 }
10478
10479 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10480 length = Py_MIN(maxlen, length);
10481 if (length <= 0)
10482 return 0;
10483
Victor Stinnerd3f08822012-05-29 12:57:52 +020010484 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010485 return length;
10486}
10487
Victor Stinner9310abb2011-10-05 00:59:23 +020010488static PyObject *
10489pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010490 Py_ssize_t left,
10491 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 PyObject *u;
10495 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010496 int kind;
10497 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498
10499 if (left < 0)
10500 left = 0;
10501 if (right < 0)
10502 right = 0;
10503
Victor Stinnerc4b49542011-12-11 22:44:26 +010010504 if (left == 0 && right == 0)
10505 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10508 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010509 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10510 return NULL;
10511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010513 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010515 if (!u)
10516 return NULL;
10517
10518 kind = PyUnicode_KIND(u);
10519 data = PyUnicode_DATA(u);
10520 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010521 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010522 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010523 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010524 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010525 assert(_PyUnicode_CheckConsistency(u, 1));
10526 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527}
10528
Alexander Belopolsky40018472011-02-26 01:02:56 +000010529PyObject *
10530PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010534 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536
Benjamin Petersonead6b532011-12-20 17:23:42 -060010537 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010539 if (PyUnicode_IS_ASCII(string))
10540 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010541 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010542 PyUnicode_GET_LENGTH(string), keepends);
10543 else
10544 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010545 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010546 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 break;
10548 case PyUnicode_2BYTE_KIND:
10549 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010550 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 PyUnicode_GET_LENGTH(string), keepends);
10552 break;
10553 case PyUnicode_4BYTE_KIND:
10554 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010555 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 PyUnicode_GET_LENGTH(string), keepends);
10557 break;
10558 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010559 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562}
10563
Alexander Belopolsky40018472011-02-26 01:02:56 +000010564static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010565split(PyObject *self,
10566 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010567 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010569 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010570 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 Py_ssize_t len1, len2;
10572 PyObject* out;
10573
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010575 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 if (PyUnicode_READY(self) == -1)
10578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010581 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010583 if (PyUnicode_IS_ASCII(self))
10584 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010585 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010586 PyUnicode_GET_LENGTH(self), maxcount
10587 );
10588 else
10589 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010590 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010591 PyUnicode_GET_LENGTH(self), maxcount
10592 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 case PyUnicode_2BYTE_KIND:
10594 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010595 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 PyUnicode_GET_LENGTH(self), maxcount
10597 );
10598 case PyUnicode_4BYTE_KIND:
10599 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010600 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 PyUnicode_GET_LENGTH(self), maxcount
10602 );
10603 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010604 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 }
10606
10607 if (PyUnicode_READY(substring) == -1)
10608 return NULL;
10609
10610 kind1 = PyUnicode_KIND(self);
10611 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 len1 = PyUnicode_GET_LENGTH(self);
10613 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010614 if (kind1 < kind2 || len1 < len2) {
10615 out = PyList_New(1);
10616 if (out == NULL)
10617 return NULL;
10618 Py_INCREF(self);
10619 PyList_SET_ITEM(out, 0, self);
10620 return out;
10621 }
10622 buf1 = PyUnicode_DATA(self);
10623 buf2 = PyUnicode_DATA(substring);
10624 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010625 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010626 if (!buf2)
10627 return NULL;
10628 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010630 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010632 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10633 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010634 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010635 else
10636 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010637 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 break;
10639 case PyUnicode_2BYTE_KIND:
10640 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010641 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 break;
10643 case PyUnicode_4BYTE_KIND:
10644 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010645 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 break;
10647 default:
10648 out = NULL;
10649 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010650 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010651 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010652 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010654}
10655
Alexander Belopolsky40018472011-02-26 01:02:56 +000010656static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010657rsplit(PyObject *self,
10658 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010659 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010660{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010661 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010662 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 Py_ssize_t len1, len2;
10664 PyObject* out;
10665
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010666 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010667 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 if (PyUnicode_READY(self) == -1)
10670 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010673 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010675 if (PyUnicode_IS_ASCII(self))
10676 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010677 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010678 PyUnicode_GET_LENGTH(self), maxcount
10679 );
10680 else
10681 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010682 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010683 PyUnicode_GET_LENGTH(self), maxcount
10684 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 case PyUnicode_2BYTE_KIND:
10686 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010687 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 PyUnicode_GET_LENGTH(self), maxcount
10689 );
10690 case PyUnicode_4BYTE_KIND:
10691 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010692 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 PyUnicode_GET_LENGTH(self), maxcount
10694 );
10695 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010696 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 }
10698
10699 if (PyUnicode_READY(substring) == -1)
10700 return NULL;
10701
10702 kind1 = PyUnicode_KIND(self);
10703 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 len1 = PyUnicode_GET_LENGTH(self);
10705 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010706 if (kind1 < kind2 || len1 < len2) {
10707 out = PyList_New(1);
10708 if (out == NULL)
10709 return NULL;
10710 Py_INCREF(self);
10711 PyList_SET_ITEM(out, 0, self);
10712 return out;
10713 }
10714 buf1 = PyUnicode_DATA(self);
10715 buf2 = PyUnicode_DATA(substring);
10716 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010717 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010718 if (!buf2)
10719 return NULL;
10720 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010722 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010724 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10725 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010726 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010727 else
10728 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010729 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 break;
10731 case PyUnicode_2BYTE_KIND:
10732 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010733 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 break;
10735 case PyUnicode_4BYTE_KIND:
10736 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010737 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 break;
10739 default:
10740 out = NULL;
10741 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010742 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010743 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010744 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 return out;
10746}
10747
10748static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010749anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10750 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010752 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010754 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10755 return asciilib_find(buf1, len1, buf2, len2, offset);
10756 else
10757 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 case PyUnicode_2BYTE_KIND:
10759 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10760 case PyUnicode_4BYTE_KIND:
10761 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10762 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010763 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764}
10765
10766static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010767anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10768 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010770 switch (kind) {
10771 case PyUnicode_1BYTE_KIND:
10772 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10773 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10774 else
10775 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10776 case PyUnicode_2BYTE_KIND:
10777 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10778 case PyUnicode_4BYTE_KIND:
10779 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10780 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010781 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010782}
10783
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010784static void
10785replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10786 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10787{
10788 int kind = PyUnicode_KIND(u);
10789 void *data = PyUnicode_DATA(u);
10790 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10791 if (kind == PyUnicode_1BYTE_KIND) {
10792 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10793 (Py_UCS1 *)data + len,
10794 u1, u2, maxcount);
10795 }
10796 else if (kind == PyUnicode_2BYTE_KIND) {
10797 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10798 (Py_UCS2 *)data + len,
10799 u1, u2, maxcount);
10800 }
10801 else {
10802 assert(kind == PyUnicode_4BYTE_KIND);
10803 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10804 (Py_UCS4 *)data + len,
10805 u1, u2, maxcount);
10806 }
10807}
10808
Alexander Belopolsky40018472011-02-26 01:02:56 +000010809static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810replace(PyObject *self, PyObject *str1,
10811 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010814 const char *sbuf = PyUnicode_DATA(self);
10815 const void *buf1 = PyUnicode_DATA(str1);
10816 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 int srelease = 0, release1 = 0, release2 = 0;
10818 int skind = PyUnicode_KIND(self);
10819 int kind1 = PyUnicode_KIND(str1);
10820 int kind2 = PyUnicode_KIND(str2);
10821 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10822 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10823 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010824 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010825 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010827 if (slen < len1)
10828 goto nothing;
10829
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010831 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010832 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010833 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
Victor Stinner59de0ee2011-10-07 10:01:28 +020010835 if (str1 == str2)
10836 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837
Victor Stinner49a0a212011-10-12 23:46:10 +020010838 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010839 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10840 if (maxchar < maxchar_str1)
10841 /* substring too wide to be present */
10842 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010843 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10844 /* Replacing str1 with str2 may cause a maxchar reduction in the
10845 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010846 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010847 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010850 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010852 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010854 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010855 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010856 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010857
Victor Stinner69ed0f42013-04-09 21:48:24 +020010858 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010859 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010860 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010861 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010862 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010864 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010866
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010867 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10868 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010869 }
10870 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871 int rkind = skind;
10872 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010873 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 if (kind1 < rkind) {
10876 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010877 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 if (!buf1) goto error;
10879 release1 = 1;
10880 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010881 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010882 if (i < 0)
10883 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884 if (rkind > kind2) {
10885 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010886 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 if (!buf2) goto error;
10888 release2 = 1;
10889 }
10890 else if (rkind < kind2) {
10891 /* widen self and buf1 */
10892 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010893 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010894 assert(buf1 != PyUnicode_DATA(str1));
10895 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010896 buf1 = PyUnicode_DATA(str1);
10897 release1 = 0;
10898 }
10899 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 if (!sbuf) goto error;
10901 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010902 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903 if (!buf1) goto error;
10904 release1 = 1;
10905 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010906 u = PyUnicode_New(slen, maxchar);
10907 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010909 assert(PyUnicode_KIND(u) == rkind);
10910 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010911
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010912 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010913 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010914 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010916 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010918
10919 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010920 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010921 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010922 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010923 if (i == -1)
10924 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010925 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010927 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010931 }
10932 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010934 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 int rkind = skind;
10936 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010939 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010940 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 if (!buf1) goto error;
10942 release1 = 1;
10943 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010944 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010945 if (n == 0)
10946 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010948 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010949 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 if (!buf2) goto error;
10951 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010954 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010956 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 if (!sbuf) goto error;
10958 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010959 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010960 assert(buf1 != PyUnicode_DATA(str1));
10961 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010962 buf1 = PyUnicode_DATA(str1);
10963 release1 = 0;
10964 }
10965 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (!buf1) goto error;
10967 release1 = 1;
10968 }
10969 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10970 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010971 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 PyErr_SetString(PyExc_OverflowError,
10973 "replace string is too long");
10974 goto error;
10975 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010976 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010977 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010978 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010979 goto done;
10980 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010981 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 PyErr_SetString(PyExc_OverflowError,
10983 "replace string is too long");
10984 goto error;
10985 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010986 u = PyUnicode_New(new_size, maxchar);
10987 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010989 assert(PyUnicode_KIND(u) == rkind);
10990 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 ires = i = 0;
10992 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010993 while (n-- > 0) {
10994 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010995 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010996 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010997 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010998 if (j == -1)
10999 break;
11000 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011001 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011002 memcpy(res + rkind * ires,
11003 sbuf + rkind * i,
11004 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011006 }
11007 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011009 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011011 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011017 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011018 memcpy(res + rkind * ires,
11019 sbuf + rkind * i,
11020 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011021 }
11022 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011023 /* interleave */
11024 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011025 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011027 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011029 if (--n <= 0)
11030 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011031 memcpy(res + rkind * ires,
11032 sbuf + rkind * i,
11033 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 ires++;
11035 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011036 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011037 memcpy(res + rkind * ires,
11038 sbuf + rkind * i,
11039 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011040 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011041 }
11042
11043 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011044 unicode_adjust_maxchar(&u);
11045 if (u == NULL)
11046 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011048
11049 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011050 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11051 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11052 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011054 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011056 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011058 PyMem_Free((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011059 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011061
Benjamin Peterson29060642009-01-31 22:14:21 +000011062 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011063 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011064 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11065 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11066 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011068 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011070 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011072 PyMem_Free((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011073 return unicode_result_unchanged(self);
11074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011076 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11077 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11078 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11079 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011080 PyMem_Free((void *)sbuf);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011081 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011082 PyMem_Free((void *)buf1);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011083 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011084 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011086}
11087
11088/* --- Unicode Object Methods --------------------------------------------- */
11089
INADA Naoki3ae20562017-01-16 20:41:20 +090011090/*[clinic input]
11091str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092
INADA Naoki3ae20562017-01-16 20:41:20 +090011093Return a version of the string where each word is titlecased.
11094
11095More specifically, words start with uppercased characters and all remaining
11096cased characters have lower case.
11097[clinic start generated code]*/
11098
11099static PyObject *
11100unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011101/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011103 if (PyUnicode_READY(self) == -1)
11104 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011105 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106}
11107
INADA Naoki3ae20562017-01-16 20:41:20 +090011108/*[clinic input]
11109str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110
INADA Naoki3ae20562017-01-16 20:41:20 +090011111Return a capitalized version of the string.
11112
11113More specifically, make the first character have upper case and the rest lower
11114case.
11115[clinic start generated code]*/
11116
11117static PyObject *
11118unicode_capitalize_impl(PyObject *self)
11119/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011121 if (PyUnicode_READY(self) == -1)
11122 return NULL;
11123 if (PyUnicode_GET_LENGTH(self) == 0)
11124 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011125 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126}
11127
INADA Naoki3ae20562017-01-16 20:41:20 +090011128/*[clinic input]
11129str.casefold as unicode_casefold
11130
11131Return a version of the string suitable for caseless comparisons.
11132[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011133
11134static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011135unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011136/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011137{
11138 if (PyUnicode_READY(self) == -1)
11139 return NULL;
11140 if (PyUnicode_IS_ASCII(self))
11141 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011142 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011143}
11144
11145
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011146/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011147
11148static int
11149convert_uc(PyObject *obj, void *addr)
11150{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011152
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011153 if (!PyUnicode_Check(obj)) {
11154 PyErr_Format(PyExc_TypeError,
11155 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011156 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011157 return 0;
11158 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011159 if (PyUnicode_READY(obj) < 0)
11160 return 0;
11161 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011162 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011163 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011164 return 0;
11165 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011166 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011167 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011168}
11169
INADA Naoki3ae20562017-01-16 20:41:20 +090011170/*[clinic input]
11171str.center as unicode_center
11172
11173 width: Py_ssize_t
11174 fillchar: Py_UCS4 = ' '
11175 /
11176
11177Return a centered string of length width.
11178
11179Padding is done using the specified fill character (default is a space).
11180[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181
11182static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011183unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11184/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011186 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187
Benjamin Petersonbac79492012-01-14 13:34:47 -050011188 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189 return NULL;
11190
Victor Stinnerc4b49542011-12-11 22:44:26 +010011191 if (PyUnicode_GET_LENGTH(self) >= width)
11192 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193
Victor Stinnerc4b49542011-12-11 22:44:26 +010011194 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 left = marg / 2 + (marg & width & 1);
11196
Victor Stinner9310abb2011-10-05 00:59:23 +020011197 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198}
11199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200/* This function assumes that str1 and str2 are readied by the caller. */
11201
Marc-André Lemburge5034372000-08-08 08:04:29 +000011202static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011203unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011204{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011205#define COMPARE(TYPE1, TYPE2) \
11206 do { \
11207 TYPE1* p1 = (TYPE1 *)data1; \
11208 TYPE2* p2 = (TYPE2 *)data2; \
11209 TYPE1* end = p1 + len; \
11210 Py_UCS4 c1, c2; \
11211 for (; p1 != end; p1++, p2++) { \
11212 c1 = *p1; \
11213 c2 = *p2; \
11214 if (c1 != c2) \
11215 return (c1 < c2) ? -1 : 1; \
11216 } \
11217 } \
11218 while (0)
11219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011221 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011222 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 kind1 = PyUnicode_KIND(str1);
11225 kind2 = PyUnicode_KIND(str2);
11226 data1 = PyUnicode_DATA(str1);
11227 data2 = PyUnicode_DATA(str2);
11228 len1 = PyUnicode_GET_LENGTH(str1);
11229 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011230 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011231
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011232 switch(kind1) {
11233 case PyUnicode_1BYTE_KIND:
11234 {
11235 switch(kind2) {
11236 case PyUnicode_1BYTE_KIND:
11237 {
11238 int cmp = memcmp(data1, data2, len);
11239 /* normalize result of memcmp() into the range [-1; 1] */
11240 if (cmp < 0)
11241 return -1;
11242 if (cmp > 0)
11243 return 1;
11244 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011245 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011246 case PyUnicode_2BYTE_KIND:
11247 COMPARE(Py_UCS1, Py_UCS2);
11248 break;
11249 case PyUnicode_4BYTE_KIND:
11250 COMPARE(Py_UCS1, Py_UCS4);
11251 break;
11252 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011253 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011254 }
11255 break;
11256 }
11257 case PyUnicode_2BYTE_KIND:
11258 {
11259 switch(kind2) {
11260 case PyUnicode_1BYTE_KIND:
11261 COMPARE(Py_UCS2, Py_UCS1);
11262 break;
11263 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011264 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011265 COMPARE(Py_UCS2, Py_UCS2);
11266 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011267 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011268 case PyUnicode_4BYTE_KIND:
11269 COMPARE(Py_UCS2, Py_UCS4);
11270 break;
11271 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011272 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011273 }
11274 break;
11275 }
11276 case PyUnicode_4BYTE_KIND:
11277 {
11278 switch(kind2) {
11279 case PyUnicode_1BYTE_KIND:
11280 COMPARE(Py_UCS4, Py_UCS1);
11281 break;
11282 case PyUnicode_2BYTE_KIND:
11283 COMPARE(Py_UCS4, Py_UCS2);
11284 break;
11285 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011286 {
11287#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11288 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11289 /* normalize result of wmemcmp() into the range [-1; 1] */
11290 if (cmp < 0)
11291 return -1;
11292 if (cmp > 0)
11293 return 1;
11294#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011295 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011296#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011297 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011298 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011299 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011300 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011301 }
11302 break;
11303 }
11304 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011305 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011306 }
11307
Victor Stinner770e19e2012-10-04 22:59:45 +020011308 if (len1 == len2)
11309 return 0;
11310 if (len1 < len2)
11311 return -1;
11312 else
11313 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011314
11315#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011316}
11317
Benjamin Peterson621b4302016-09-09 13:54:34 -070011318static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011319unicode_compare_eq(PyObject *str1, PyObject *str2)
11320{
11321 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011322 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011323 Py_ssize_t len;
11324 int cmp;
11325
Victor Stinnere5567ad2012-10-23 02:48:49 +020011326 len = PyUnicode_GET_LENGTH(str1);
11327 if (PyUnicode_GET_LENGTH(str2) != len)
11328 return 0;
11329 kind = PyUnicode_KIND(str1);
11330 if (PyUnicode_KIND(str2) != kind)
11331 return 0;
11332 data1 = PyUnicode_DATA(str1);
11333 data2 = PyUnicode_DATA(str2);
11334
11335 cmp = memcmp(data1, data2, len * kind);
11336 return (cmp == 0);
11337}
11338
11339
Alexander Belopolsky40018472011-02-26 01:02:56 +000011340int
11341PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11344 if (PyUnicode_READY(left) == -1 ||
11345 PyUnicode_READY(right) == -1)
11346 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011347
11348 /* a string is equal to itself */
11349 if (left == right)
11350 return 0;
11351
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011352 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011354 PyErr_Format(PyExc_TypeError,
11355 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011356 Py_TYPE(left)->tp_name,
11357 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358 return -1;
11359}
11360
Martin v. Löwis5b222132007-06-10 09:51:05 +000011361int
11362PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11363{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 Py_ssize_t i;
11365 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011367 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368
Victor Stinner910337b2011-10-03 03:20:16 +020011369 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011370 if (!PyUnicode_IS_READY(uni)) {
11371 const wchar_t *ws = _PyUnicode_WSTR(uni);
11372 /* Compare Unicode string and source character set string */
11373 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11374 if (chr != ustr[i])
11375 return (chr < ustr[i]) ? -1 : 1;
11376 }
11377 /* This check keeps Python strings that end in '\0' from comparing equal
11378 to C strings identical up to that point. */
11379 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11380 return 1; /* uni is longer */
11381 if (ustr[i])
11382 return -1; /* str is longer */
11383 return 0;
11384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011386 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011387 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011388 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011389 size_t len, len2 = strlen(str);
11390 int cmp;
11391
11392 len = Py_MIN(len1, len2);
11393 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011394 if (cmp != 0) {
11395 if (cmp < 0)
11396 return -1;
11397 else
11398 return 1;
11399 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011400 if (len1 > len2)
11401 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011402 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011403 return -1; /* str is longer */
11404 return 0;
11405 }
11406 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011407 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011408 /* Compare Unicode string and source character set string */
11409 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011410 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011411 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11412 /* This check keeps Python strings that end in '\0' from comparing equal
11413 to C strings identical up to that point. */
11414 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11415 return 1; /* uni is longer */
11416 if (str[i])
11417 return -1; /* str is longer */
11418 return 0;
11419 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011420}
11421
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011422static int
11423non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11424{
11425 size_t i, len;
11426 const wchar_t *p;
11427 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11428 if (strlen(str) != len)
11429 return 0;
11430 p = _PyUnicode_WSTR(unicode);
11431 assert(p);
11432 for (i = 0; i < len; i++) {
11433 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011434 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011435 return 0;
11436 }
11437 return 1;
11438}
11439
11440int
11441_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11442{
11443 size_t len;
11444 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011445 assert(str);
11446#ifndef NDEBUG
11447 for (const char *p = str; *p; p++) {
11448 assert((unsigned char)*p < 128);
11449 }
11450#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011451 if (PyUnicode_READY(unicode) == -1) {
11452 /* Memory error or bad data */
11453 PyErr_Clear();
11454 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11455 }
11456 if (!PyUnicode_IS_ASCII(unicode))
11457 return 0;
11458 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11459 return strlen(str) == len &&
11460 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11461}
11462
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011463int
11464_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11465{
11466 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011467
11468 assert(_PyUnicode_CHECK(left));
11469 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011470#ifndef NDEBUG
11471 for (const char *p = right->string; *p; p++) {
11472 assert((unsigned char)*p < 128);
11473 }
11474#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011475
11476 if (PyUnicode_READY(left) == -1) {
11477 /* memory error or bad data */
11478 PyErr_Clear();
11479 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11480 }
11481
11482 if (!PyUnicode_IS_ASCII(left))
11483 return 0;
11484
11485 right_uni = _PyUnicode_FromId(right); /* borrowed */
11486 if (right_uni == NULL) {
11487 /* memory error or bad data */
11488 PyErr_Clear();
11489 return _PyUnicode_EqualToASCIIString(left, right->string);
11490 }
11491
11492 if (left == right_uni)
11493 return 1;
11494
11495 if (PyUnicode_CHECK_INTERNED(left))
11496 return 0;
11497
Victor Stinner607b1022020-05-05 18:50:30 +020011498#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011499 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011500 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011501 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11502 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011503#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011504
11505 return unicode_compare_eq(left, right_uni);
11506}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011507
Alexander Belopolsky40018472011-02-26 01:02:56 +000011508PyObject *
11509PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011510{
11511 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011512
Victor Stinnere5567ad2012-10-23 02:48:49 +020011513 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11514 Py_RETURN_NOTIMPLEMENTED;
11515
11516 if (PyUnicode_READY(left) == -1 ||
11517 PyUnicode_READY(right) == -1)
11518 return NULL;
11519
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011520 if (left == right) {
11521 switch (op) {
11522 case Py_EQ:
11523 case Py_LE:
11524 case Py_GE:
11525 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011526 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011527 case Py_NE:
11528 case Py_LT:
11529 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011530 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011531 default:
11532 PyErr_BadArgument();
11533 return NULL;
11534 }
11535 }
11536 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011537 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011538 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011539 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011540 }
11541 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011542 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011543 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011544 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011545}
11546
Alexander Belopolsky40018472011-02-26 01:02:56 +000011547int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011548_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11549{
11550 return unicode_eq(aa, bb);
11551}
11552
11553int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011554PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011555{
Victor Stinner77282cb2013-04-14 19:22:47 +020011556 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011557 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011559 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011560
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011561 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011563 "'in <string>' requires string as left operand, not %.100s",
11564 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011565 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011566 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011567 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011568 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011569 if (ensure_unicode(str) < 0)
11570 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011573 kind2 = PyUnicode_KIND(substr);
11574 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011575 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011577 len2 = PyUnicode_GET_LENGTH(substr);
11578 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011579 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011580 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011581 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011582 if (len2 == 1) {
11583 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11584 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011585 return result;
11586 }
11587 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011588 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011589 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011590 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592
Victor Stinner77282cb2013-04-14 19:22:47 +020011593 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 case PyUnicode_1BYTE_KIND:
11595 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11596 break;
11597 case PyUnicode_2BYTE_KIND:
11598 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11599 break;
11600 case PyUnicode_4BYTE_KIND:
11601 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11602 break;
11603 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011604 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011606
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011607 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011608 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011609 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610
Guido van Rossum403d68b2000-03-13 15:55:09 +000011611 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011612}
11613
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614/* Concat to string or Unicode object giving a new Unicode object. */
11615
Alexander Belopolsky40018472011-02-26 01:02:56 +000011616PyObject *
11617PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011619 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011620 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011621 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011623 if (ensure_unicode(left) < 0)
11624 return NULL;
11625
11626 if (!PyUnicode_Check(right)) {
11627 PyErr_Format(PyExc_TypeError,
11628 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011629 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011630 return NULL;
11631 }
11632 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634
11635 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011636 PyObject *empty = unicode_get_empty(); // Borrowed reference
11637 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011638 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011639 }
11640 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011641 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011644 left_len = PyUnicode_GET_LENGTH(left);
11645 right_len = PyUnicode_GET_LENGTH(right);
11646 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011647 PyErr_SetString(PyExc_OverflowError,
11648 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011649 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011650 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011651 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011652
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011653 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11654 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011655 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011658 result = PyUnicode_New(new_len, maxchar);
11659 if (result == NULL)
11660 return NULL;
11661 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11662 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11663 assert(_PyUnicode_CheckConsistency(result, 1));
11664 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665}
11666
Walter Dörwald1ab83302007-05-18 17:15:44 +000011667void
Victor Stinner23e56682011-10-03 03:54:37 +020011668PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011669{
Victor Stinner23e56682011-10-03 03:54:37 +020011670 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011671 Py_UCS4 maxchar, maxchar2;
11672 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011673
11674 if (p_left == NULL) {
11675 if (!PyErr_Occurred())
11676 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011677 return;
11678 }
Victor Stinner23e56682011-10-03 03:54:37 +020011679 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011680 if (right == NULL || left == NULL
11681 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011682 if (!PyErr_Occurred())
11683 PyErr_BadInternalCall();
11684 goto error;
11685 }
11686
Benjamin Petersonbac79492012-01-14 13:34:47 -050011687 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011688 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011689 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011690 goto error;
11691
Victor Stinner488fa492011-12-12 00:01:39 +010011692 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011693 PyObject *empty = unicode_get_empty(); // Borrowed reference
11694 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011695 Py_DECREF(left);
11696 Py_INCREF(right);
11697 *p_left = right;
11698 return;
11699 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011700 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011701 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011702 }
Victor Stinner488fa492011-12-12 00:01:39 +010011703
11704 left_len = PyUnicode_GET_LENGTH(left);
11705 right_len = PyUnicode_GET_LENGTH(right);
11706 if (left_len > PY_SSIZE_T_MAX - right_len) {
11707 PyErr_SetString(PyExc_OverflowError,
11708 "strings are too large to concat");
11709 goto error;
11710 }
11711 new_len = left_len + right_len;
11712
11713 if (unicode_modifiable(left)
11714 && PyUnicode_CheckExact(right)
11715 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011716 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11717 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011718 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011719 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011720 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11721 {
11722 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011723 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011724 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011725
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011726 /* copy 'right' into the newly allocated area of 'left' */
11727 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011728 }
Victor Stinner488fa492011-12-12 00:01:39 +010011729 else {
11730 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11731 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011732 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011733
Victor Stinner488fa492011-12-12 00:01:39 +010011734 /* Concat the two Unicode strings */
11735 res = PyUnicode_New(new_len, maxchar);
11736 if (res == NULL)
11737 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011738 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11739 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011740 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011741 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011742 }
11743 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011744 return;
11745
11746error:
Victor Stinner488fa492011-12-12 00:01:39 +010011747 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011748}
11749
11750void
11751PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11752{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011753 PyUnicode_Append(pleft, right);
11754 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011755}
11756
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011757/*
11758Wraps stringlib_parse_args_finds() and additionally ensures that the
11759first argument is a unicode object.
11760*/
11761
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011762static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011763parse_args_finds_unicode(const char * function_name, PyObject *args,
11764 PyObject **substring,
11765 Py_ssize_t *start, Py_ssize_t *end)
11766{
11767 if(stringlib_parse_args_finds(function_name, args, substring,
11768 start, end)) {
11769 if (ensure_unicode(*substring) < 0)
11770 return 0;
11771 return 1;
11772 }
11773 return 0;
11774}
11775
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011776PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011779Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011780string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011781interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782
11783static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011784unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011786 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011787 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011788 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011790 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011791 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011794 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 kind1 = PyUnicode_KIND(self);
11798 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011799 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011800 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 len1 = PyUnicode_GET_LENGTH(self);
11803 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011805 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011806 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011807
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011808 buf1 = PyUnicode_DATA(self);
11809 buf2 = PyUnicode_DATA(substring);
11810 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011811 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011812 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011813 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011814 }
11815 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 case PyUnicode_1BYTE_KIND:
11817 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011818 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 buf2, len2, PY_SSIZE_T_MAX
11820 );
11821 break;
11822 case PyUnicode_2BYTE_KIND:
11823 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011824 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 buf2, len2, PY_SSIZE_T_MAX
11826 );
11827 break;
11828 case PyUnicode_4BYTE_KIND:
11829 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011830 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 buf2, len2, PY_SSIZE_T_MAX
11832 );
11833 break;
11834 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011835 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 }
11837
11838 result = PyLong_FromSsize_t(iresult);
11839
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011840 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011841 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011842 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 return result;
11845}
11846
INADA Naoki3ae20562017-01-16 20:41:20 +090011847/*[clinic input]
11848str.encode as unicode_encode
11849
11850 encoding: str(c_default="NULL") = 'utf-8'
11851 The encoding in which to encode the string.
11852 errors: str(c_default="NULL") = 'strict'
11853 The error handling scheme to use for encoding errors.
11854 The default is 'strict' meaning that encoding errors raise a
11855 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11856 'xmlcharrefreplace' as well as any other name registered with
11857 codecs.register_error that can handle UnicodeEncodeErrors.
11858
11859Encode the string using the codec registered for encoding.
11860[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861
11862static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011863unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011864/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011866 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011867}
11868
INADA Naoki3ae20562017-01-16 20:41:20 +090011869/*[clinic input]
11870str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871
INADA Naoki3ae20562017-01-16 20:41:20 +090011872 tabsize: int = 8
11873
11874Return a copy where all tab characters are expanded using spaces.
11875
11876If tabsize is not given, a tab size of 8 characters is assumed.
11877[clinic start generated code]*/
11878
11879static PyObject *
11880unicode_expandtabs_impl(PyObject *self, int tabsize)
11881/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011883 Py_ssize_t i, j, line_pos, src_len, incr;
11884 Py_UCS4 ch;
11885 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011886 const void *src_data;
11887 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011888 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011889 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
Antoine Pitrou22425222011-10-04 19:10:51 +020011891 if (PyUnicode_READY(self) == -1)
11892 return NULL;
11893
Thomas Wouters7e474022000-07-16 12:04:32 +000011894 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011895 src_len = PyUnicode_GET_LENGTH(self);
11896 i = j = line_pos = 0;
11897 kind = PyUnicode_KIND(self);
11898 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011899 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011900 for (; i < src_len; i++) {
11901 ch = PyUnicode_READ(kind, src_data, i);
11902 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011903 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011905 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011907 goto overflow;
11908 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011910 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011914 goto overflow;
11915 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011917 if (ch == '\n' || ch == '\r')
11918 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011920 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011921 if (!found)
11922 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011923
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011925 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 if (!u)
11927 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011928 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929
Antoine Pitroue71d5742011-10-04 15:55:09 +020011930 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
Antoine Pitroue71d5742011-10-04 15:55:09 +020011932 for (; i < src_len; i++) {
11933 ch = PyUnicode_READ(kind, src_data, i);
11934 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011936 incr = tabsize - (line_pos % tabsize);
11937 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011938 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011939 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011941 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011943 line_pos++;
11944 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011945 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011946 if (ch == '\n' || ch == '\r')
11947 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011949 }
11950 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011951 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011952
Antoine Pitroue71d5742011-10-04 15:55:09 +020011953 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011954 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956}
11957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011958PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960\n\
11961Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011962such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963arguments start and end are interpreted as in slice notation.\n\
11964\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011965Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966
11967static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011970 /* initialize variables to prevent gcc warning */
11971 PyObject *substring = NULL;
11972 Py_ssize_t start = 0;
11973 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011974 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011976 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011979 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011982 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (result == -2)
11985 return NULL;
11986
Christian Heimes217cfd12007-12-02 14:31:20 +000011987 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988}
11989
11990static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011991unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011993 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011994 enum PyUnicode_Kind kind;
11995 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011996
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011997 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011998 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012000 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012001 if (PyUnicode_READY(self) == -1) {
12002 return NULL;
12003 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012004 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12005 PyErr_SetString(PyExc_IndexError, "string index out of range");
12006 return NULL;
12007 }
12008 kind = PyUnicode_KIND(self);
12009 data = PyUnicode_DATA(self);
12010 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012011 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012}
12013
Guido van Rossumc2504932007-09-18 19:42:40 +000012014/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012015 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012016static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012017unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012019 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012020
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012021#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012022 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012023#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 if (_PyUnicode_HASH(self) != -1)
12025 return _PyUnicode_HASH(self);
12026 if (PyUnicode_READY(self) == -1)
12027 return -1;
animalizea1d14252019-01-02 20:16:06 +080012028
Christian Heimes985ecdc2013-11-20 11:46:18 +010012029 x = _Py_HashBytes(PyUnicode_DATA(self),
12030 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012032 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033}
12034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012035PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012036 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037\n\
oldkaa0735f2018-02-02 16:52:55 +080012038Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012039such that sub is contained within S[start:end]. Optional\n\
12040arguments start and end are interpreted as in slice notation.\n\
12041\n\
12042Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043
12044static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012047 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012048 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012049 PyObject *substring = NULL;
12050 Py_ssize_t start = 0;
12051 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012053 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012056 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012059 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 if (result == -2)
12062 return NULL;
12063
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064 if (result < 0) {
12065 PyErr_SetString(PyExc_ValueError, "substring not found");
12066 return NULL;
12067 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012068
Christian Heimes217cfd12007-12-02 14:31:20 +000012069 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070}
12071
INADA Naoki3ae20562017-01-16 20:41:20 +090012072/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012073str.isascii as unicode_isascii
12074
12075Return True if all characters in the string are ASCII, False otherwise.
12076
12077ASCII characters have code points in the range U+0000-U+007F.
12078Empty string is ASCII too.
12079[clinic start generated code]*/
12080
12081static PyObject *
12082unicode_isascii_impl(PyObject *self)
12083/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12084{
12085 if (PyUnicode_READY(self) == -1) {
12086 return NULL;
12087 }
12088 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12089}
12090
12091/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012092str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093
INADA Naoki3ae20562017-01-16 20:41:20 +090012094Return True if the string is a lowercase string, False otherwise.
12095
12096A string is lowercase if all cased characters in the string are lowercase and
12097there is at least one cased character in the string.
12098[clinic start generated code]*/
12099
12100static PyObject *
12101unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012102/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 Py_ssize_t i, length;
12105 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012106 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107 int cased;
12108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 if (PyUnicode_READY(self) == -1)
12110 return NULL;
12111 length = PyUnicode_GET_LENGTH(self);
12112 kind = PyUnicode_KIND(self);
12113 data = PyUnicode_DATA(self);
12114
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 if (length == 1)
12117 return PyBool_FromLong(
12118 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012120 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012122 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012123
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 for (i = 0; i < length; i++) {
12126 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012127
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012129 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012130 else if (!cased && Py_UNICODE_ISLOWER(ch))
12131 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012133 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134}
12135
INADA Naoki3ae20562017-01-16 20:41:20 +090012136/*[clinic input]
12137str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138
INADA Naoki3ae20562017-01-16 20:41:20 +090012139Return True if the string is an uppercase string, False otherwise.
12140
12141A string is uppercase if all cased characters in the string are uppercase and
12142there is at least one cased character in the string.
12143[clinic start generated code]*/
12144
12145static PyObject *
12146unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012147/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 Py_ssize_t i, length;
12150 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012151 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152 int cased;
12153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 if (PyUnicode_READY(self) == -1)
12155 return NULL;
12156 length = PyUnicode_GET_LENGTH(self);
12157 kind = PyUnicode_KIND(self);
12158 data = PyUnicode_DATA(self);
12159
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 if (length == 1)
12162 return PyBool_FromLong(
12163 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012165 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012167 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012168
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 for (i = 0; i < length; i++) {
12171 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012172
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012174 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 else if (!cased && Py_UNICODE_ISUPPER(ch))
12176 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012178 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179}
12180
INADA Naoki3ae20562017-01-16 20:41:20 +090012181/*[clinic input]
12182str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183
INADA Naoki3ae20562017-01-16 20:41:20 +090012184Return True if the string is a title-cased string, False otherwise.
12185
12186In a title-cased string, upper- and title-case characters may only
12187follow uncased characters and lowercase characters only cased ones.
12188[clinic start generated code]*/
12189
12190static PyObject *
12191unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012192/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 Py_ssize_t i, length;
12195 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012196 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197 int cased, previous_is_cased;
12198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 if (PyUnicode_READY(self) == -1)
12200 return NULL;
12201 length = PyUnicode_GET_LENGTH(self);
12202 kind = PyUnicode_KIND(self);
12203 data = PyUnicode_DATA(self);
12204
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 if (length == 1) {
12207 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12208 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12209 (Py_UNICODE_ISUPPER(ch) != 0));
12210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012212 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012214 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012215
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216 cased = 0;
12217 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 for (i = 0; i < length; i++) {
12219 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012220
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12222 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012223 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012224 previous_is_cased = 1;
12225 cased = 1;
12226 }
12227 else if (Py_UNICODE_ISLOWER(ch)) {
12228 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012229 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 previous_is_cased = 1;
12231 cased = 1;
12232 }
12233 else
12234 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012236 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237}
12238
INADA Naoki3ae20562017-01-16 20:41:20 +090012239/*[clinic input]
12240str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241
INADA Naoki3ae20562017-01-16 20:41:20 +090012242Return True if the string is a whitespace string, False otherwise.
12243
12244A string is whitespace if all characters in the string are whitespace and there
12245is at least one character in the string.
12246[clinic start generated code]*/
12247
12248static PyObject *
12249unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012250/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 Py_ssize_t i, length;
12253 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012254 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255
12256 if (PyUnicode_READY(self) == -1)
12257 return NULL;
12258 length = PyUnicode_GET_LENGTH(self);
12259 kind = PyUnicode_KIND(self);
12260 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 if (length == 1)
12264 return PyBool_FromLong(
12265 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012267 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012269 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 for (i = 0; i < length; i++) {
12272 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012273 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012274 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012276 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277}
12278
INADA Naoki3ae20562017-01-16 20:41:20 +090012279/*[clinic input]
12280str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012281
INADA Naoki3ae20562017-01-16 20:41:20 +090012282Return True if the string is an alphabetic string, False otherwise.
12283
12284A string is alphabetic if all characters in the string are alphabetic and there
12285is at least one character in the string.
12286[clinic start generated code]*/
12287
12288static PyObject *
12289unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012290/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012291{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 Py_ssize_t i, length;
12293 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012294 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295
12296 if (PyUnicode_READY(self) == -1)
12297 return NULL;
12298 length = PyUnicode_GET_LENGTH(self);
12299 kind = PyUnicode_KIND(self);
12300 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012301
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012302 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 if (length == 1)
12304 return PyBool_FromLong(
12305 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012306
12307 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012309 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 for (i = 0; i < length; i++) {
12312 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012313 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012314 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012315 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012316}
12317
INADA Naoki3ae20562017-01-16 20:41:20 +090012318/*[clinic input]
12319str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012320
INADA Naoki3ae20562017-01-16 20:41:20 +090012321Return True if the string is an alpha-numeric string, False otherwise.
12322
12323A string is alpha-numeric if all characters in the string are alpha-numeric and
12324there is at least one character in the string.
12325[clinic start generated code]*/
12326
12327static PyObject *
12328unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012329/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012330{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012332 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 Py_ssize_t len, i;
12334
12335 if (PyUnicode_READY(self) == -1)
12336 return NULL;
12337
12338 kind = PyUnicode_KIND(self);
12339 data = PyUnicode_DATA(self);
12340 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012341
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012342 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 if (len == 1) {
12344 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12345 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12346 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012347
12348 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012350 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 for (i = 0; i < len; i++) {
12353 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012354 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012355 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012356 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012357 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012358}
12359
INADA Naoki3ae20562017-01-16 20:41:20 +090012360/*[clinic input]
12361str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362
INADA Naoki3ae20562017-01-16 20:41:20 +090012363Return True if the string is a decimal string, False otherwise.
12364
12365A string is a decimal string if all characters in the string are decimal and
12366there is at least one character in the string.
12367[clinic start generated code]*/
12368
12369static PyObject *
12370unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012371/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 Py_ssize_t i, length;
12374 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012375 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376
12377 if (PyUnicode_READY(self) == -1)
12378 return NULL;
12379 length = PyUnicode_GET_LENGTH(self);
12380 kind = PyUnicode_KIND(self);
12381 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 if (length == 1)
12385 return PyBool_FromLong(
12386 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012388 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012390 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 for (i = 0; i < length; i++) {
12393 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012394 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012396 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397}
12398
INADA Naoki3ae20562017-01-16 20:41:20 +090012399/*[clinic input]
12400str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401
INADA Naoki3ae20562017-01-16 20:41:20 +090012402Return True if the string is a digit string, False otherwise.
12403
12404A string is a digit string if all characters in the string are digits and there
12405is at least one character in the string.
12406[clinic start generated code]*/
12407
12408static PyObject *
12409unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012410/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 Py_ssize_t i, length;
12413 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012414 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415
12416 if (PyUnicode_READY(self) == -1)
12417 return NULL;
12418 length = PyUnicode_GET_LENGTH(self);
12419 kind = PyUnicode_KIND(self);
12420 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 if (length == 1) {
12424 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12425 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012428 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012430 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 for (i = 0; i < length; i++) {
12433 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012434 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012436 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437}
12438
INADA Naoki3ae20562017-01-16 20:41:20 +090012439/*[clinic input]
12440str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441
INADA Naoki3ae20562017-01-16 20:41:20 +090012442Return True if the string is a numeric string, False otherwise.
12443
12444A string is numeric if all characters in the string are numeric and there is at
12445least one character in the string.
12446[clinic start generated code]*/
12447
12448static PyObject *
12449unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012450/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452 Py_ssize_t i, length;
12453 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012454 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455
12456 if (PyUnicode_READY(self) == -1)
12457 return NULL;
12458 length = PyUnicode_GET_LENGTH(self);
12459 kind = PyUnicode_KIND(self);
12460 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 if (length == 1)
12464 return PyBool_FromLong(
12465 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012467 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012469 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 for (i = 0; i < length; i++) {
12472 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012473 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012475 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476}
12477
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012478Py_ssize_t
12479_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012482 if (PyUnicode_READY(self) == -1)
12483 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012484
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012485 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012486 if (len == 0) {
12487 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 }
12490
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012491 int kind = PyUnicode_KIND(self);
12492 const void *data = PyUnicode_DATA(self);
12493 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012494 /* PEP 3131 says that the first character must be in
12495 XID_Start and subsequent characters in XID_Continue,
12496 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012497 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012498 letters, digits, underscore). However, given the current
12499 definition of XID_Start and XID_Continue, it is sufficient
12500 to check just for these, except that _ must be allowed
12501 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012502 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012503 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012504 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012505
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012506 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012507 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012508 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012509 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012510 }
12511 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012512 return i;
12513}
12514
12515int
12516PyUnicode_IsIdentifier(PyObject *self)
12517{
12518 if (PyUnicode_IS_READY(self)) {
12519 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12520 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12521 /* an empty string is not a valid identifier */
12522 return len && i == len;
12523 }
12524 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012525_Py_COMP_DIAG_PUSH
12526_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012527 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012528 if (len == 0) {
12529 /* an empty string is not a valid identifier */
12530 return 0;
12531 }
12532
12533 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012534 Py_UCS4 ch = wstr[i++];
12535#if SIZEOF_WCHAR_T == 2
12536 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12537 && i < len
12538 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12539 {
12540 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12541 i++;
12542 }
12543#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012544 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12545 return 0;
12546 }
12547
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012548 while (i < len) {
12549 ch = wstr[i++];
12550#if SIZEOF_WCHAR_T == 2
12551 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12552 && i < len
12553 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12554 {
12555 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12556 i++;
12557 }
12558#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012559 if (!_PyUnicode_IsXidContinue(ch)) {
12560 return 0;
12561 }
12562 }
12563 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012564_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012565 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012566}
12567
INADA Naoki3ae20562017-01-16 20:41:20 +090012568/*[clinic input]
12569str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012570
INADA Naoki3ae20562017-01-16 20:41:20 +090012571Return True if the string is a valid Python identifier, False otherwise.
12572
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012573Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012574such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012575[clinic start generated code]*/
12576
12577static PyObject *
12578unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012579/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012580{
12581 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12582}
12583
INADA Naoki3ae20562017-01-16 20:41:20 +090012584/*[clinic input]
12585str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012586
INADA Naoki3ae20562017-01-16 20:41:20 +090012587Return True if the string is printable, False otherwise.
12588
12589A string is printable if all of its characters are considered printable in
12590repr() or if it is empty.
12591[clinic start generated code]*/
12592
12593static PyObject *
12594unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012595/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012596{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 Py_ssize_t i, length;
12598 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012599 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600
12601 if (PyUnicode_READY(self) == -1)
12602 return NULL;
12603 length = PyUnicode_GET_LENGTH(self);
12604 kind = PyUnicode_KIND(self);
12605 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012606
12607 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 if (length == 1)
12609 return PyBool_FromLong(
12610 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 for (i = 0; i < length; i++) {
12613 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012614 Py_RETURN_FALSE;
12615 }
12616 }
12617 Py_RETURN_TRUE;
12618}
12619
INADA Naoki3ae20562017-01-16 20:41:20 +090012620/*[clinic input]
12621str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622
INADA Naoki3ae20562017-01-16 20:41:20 +090012623 iterable: object
12624 /
12625
12626Concatenate any number of strings.
12627
Martin Panter91a88662017-01-24 00:30:06 +000012628The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012629The result is returned as a new string.
12630
12631Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12632[clinic start generated code]*/
12633
12634static PyObject *
12635unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012636/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637{
INADA Naoki3ae20562017-01-16 20:41:20 +090012638 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639}
12640
Martin v. Löwis18e16552006-02-15 17:27:45 +000012641static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012642unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 if (PyUnicode_READY(self) == -1)
12645 return -1;
12646 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647}
12648
INADA Naoki3ae20562017-01-16 20:41:20 +090012649/*[clinic input]
12650str.ljust as unicode_ljust
12651
12652 width: Py_ssize_t
12653 fillchar: Py_UCS4 = ' '
12654 /
12655
12656Return a left-justified string of length width.
12657
12658Padding is done using the specified fill character (default is a space).
12659[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660
12661static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012662unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12663/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012665 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667
Victor Stinnerc4b49542011-12-11 22:44:26 +010012668 if (PyUnicode_GET_LENGTH(self) >= width)
12669 return unicode_result_unchanged(self);
12670
12671 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672}
12673
INADA Naoki3ae20562017-01-16 20:41:20 +090012674/*[clinic input]
12675str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676
INADA Naoki3ae20562017-01-16 20:41:20 +090012677Return a copy of the string converted to lowercase.
12678[clinic start generated code]*/
12679
12680static PyObject *
12681unicode_lower_impl(PyObject *self)
12682/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012684 if (PyUnicode_READY(self) == -1)
12685 return NULL;
12686 if (PyUnicode_IS_ASCII(self))
12687 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012688 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689}
12690
/* Strip modes accepted by the strip helpers; the values double as indices
   into stripfuncnames[] below, so keep the two in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   table itself are read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip mode `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012699
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012700/* externally visible for str.strip(unicode) */
12701PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012702_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012703{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012704 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 int kind;
12706 Py_ssize_t i, j, len;
12707 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012708 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12711 return NULL;
12712
12713 kind = PyUnicode_KIND(self);
12714 data = PyUnicode_DATA(self);
12715 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012716 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12718 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012719 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012720
Benjamin Peterson14339b62009-01-31 16:36:08 +000012721 i = 0;
12722 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012723 while (i < len) {
12724 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12725 if (!BLOOM(sepmask, ch))
12726 break;
12727 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12728 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 i++;
12730 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012731 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012732
Benjamin Peterson14339b62009-01-31 16:36:08 +000012733 j = len;
12734 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012735 j--;
12736 while (j >= i) {
12737 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12738 if (!BLOOM(sepmask, ch))
12739 break;
12740 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12741 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012743 }
12744
Benjamin Peterson29060642009-01-31 22:14:21 +000012745 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012747
Victor Stinner7931d9a2011-11-04 00:22:48 +010012748 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749}
12750
12751PyObject*
12752PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12753{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012754 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012756 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757
Victor Stinnerde636f32011-10-01 03:55:54 +020012758 if (PyUnicode_READY(self) == -1)
12759 return NULL;
12760
Victor Stinner684d5fd2012-05-03 02:32:34 +020012761 length = PyUnicode_GET_LENGTH(self);
12762 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012763
Victor Stinner684d5fd2012-05-03 02:32:34 +020012764 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012765 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012766
Victor Stinnerde636f32011-10-01 03:55:54 +020012767 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012768 PyErr_SetString(PyExc_IndexError, "string index out of range");
12769 return NULL;
12770 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012771 if (start >= length || end < start)
12772 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012773
Victor Stinner684d5fd2012-05-03 02:32:34 +020012774 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012775 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012776 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012777 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012778 }
12779 else {
12780 kind = PyUnicode_KIND(self);
12781 data = PyUnicode_1BYTE_DATA(self);
12782 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012783 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012784 length);
12785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787
12788static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012789do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 Py_ssize_t len, i, j;
12792
12793 if (PyUnicode_READY(self) == -1)
12794 return NULL;
12795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012797
Victor Stinnercc7af722013-04-09 22:39:24 +020012798 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012799 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012800
12801 i = 0;
12802 if (striptype != RIGHTSTRIP) {
12803 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012804 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012805 if (!_Py_ascii_whitespace[ch])
12806 break;
12807 i++;
12808 }
12809 }
12810
12811 j = len;
12812 if (striptype != LEFTSTRIP) {
12813 j--;
12814 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012815 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012816 if (!_Py_ascii_whitespace[ch])
12817 break;
12818 j--;
12819 }
12820 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012821 }
12822 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012823 else {
12824 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012825 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012826
Victor Stinnercc7af722013-04-09 22:39:24 +020012827 i = 0;
12828 if (striptype != RIGHTSTRIP) {
12829 while (i < len) {
12830 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12831 if (!Py_UNICODE_ISSPACE(ch))
12832 break;
12833 i++;
12834 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012835 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012836
12837 j = len;
12838 if (striptype != LEFTSTRIP) {
12839 j--;
12840 while (j >= i) {
12841 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12842 if (!Py_UNICODE_ISSPACE(ch))
12843 break;
12844 j--;
12845 }
12846 j++;
12847 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012848 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012849
Victor Stinner7931d9a2011-11-04 00:22:48 +010012850 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851}
12852
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012853
12854static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012855do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012856{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012857 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012858 if (PyUnicode_Check(sep))
12859 return _PyUnicode_XStrip(self, striptype, sep);
12860 else {
12861 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 "%s arg must be None or str",
12863 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012864 return NULL;
12865 }
12866 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012867
Benjamin Peterson14339b62009-01-31 16:36:08 +000012868 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012869}
12870
12871
INADA Naoki3ae20562017-01-16 20:41:20 +090012872/*[clinic input]
12873str.strip as unicode_strip
12874
12875 chars: object = None
12876 /
12877
Zachary Ware09895c22019-10-09 16:09:00 -050012878Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012879
12880If chars is given and not None, remove characters in chars instead.
12881[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012882
12883static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012884unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012885/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012886{
INADA Naoki3ae20562017-01-16 20:41:20 +090012887 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012888}
12889
12890
INADA Naoki3ae20562017-01-16 20:41:20 +090012891/*[clinic input]
12892str.lstrip as unicode_lstrip
12893
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012894 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012895 /
12896
12897Return a copy of the string with leading whitespace removed.
12898
12899If chars is given and not None, remove characters in chars instead.
12900[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012901
12902static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012903unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012904/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012905{
INADA Naoki3ae20562017-01-16 20:41:20 +090012906 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012907}
12908
12909
INADA Naoki3ae20562017-01-16 20:41:20 +090012910/*[clinic input]
12911str.rstrip as unicode_rstrip
12912
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012913 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012914 /
12915
12916Return a copy of the string with trailing whitespace removed.
12917
12918If chars is given and not None, remove characters in chars instead.
12919[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012920
12921static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012922unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012923/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012924{
INADA Naoki3ae20562017-01-16 20:41:20 +090012925 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012926}
12927
12928
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012930unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012931{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012932 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012934
Serhiy Storchaka05997252013-01-26 12:14:02 +020012935 if (len < 1)
12936 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937
Victor Stinnerc4b49542011-12-11 22:44:26 +010012938 /* no repeat, return original string */
12939 if (len == 1)
12940 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012941
Benjamin Petersonbac79492012-01-14 13:34:47 -050012942 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012943 return NULL;
12944
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012945 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012946 PyErr_SetString(PyExc_OverflowError,
12947 "repeated string is too long");
12948 return NULL;
12949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012951
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012952 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012953 if (!u)
12954 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012955 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012958 int kind = PyUnicode_KIND(str);
12959 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012960 if (kind == PyUnicode_1BYTE_KIND) {
12961 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012962 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012963 }
12964 else if (kind == PyUnicode_2BYTE_KIND) {
12965 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012966 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012967 ucs2[n] = fill_char;
12968 } else {
12969 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12970 assert(kind == PyUnicode_4BYTE_KIND);
12971 for (n = 0; n < len; ++n)
12972 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012973 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 }
12975 else {
12976 /* number of characters copied this far */
12977 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012978 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012980 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012982 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012984 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987 }
12988
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012989 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012990 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012991}
12992
Alexander Belopolsky40018472011-02-26 01:02:56 +000012993PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012994PyUnicode_Replace(PyObject *str,
12995 PyObject *substr,
12996 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012997 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012998{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012999 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13000 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013002 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013003}
13004
INADA Naoki3ae20562017-01-16 20:41:20 +090013005/*[clinic input]
13006str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007
INADA Naoki3ae20562017-01-16 20:41:20 +090013008 old: unicode
13009 new: unicode
13010 count: Py_ssize_t = -1
13011 Maximum number of occurrences to replace.
13012 -1 (the default value) means replace all occurrences.
13013 /
13014
13015Return a copy with all occurrences of substring old replaced by new.
13016
13017If the optional argument count is given, only the first count occurrences are
13018replaced.
13019[clinic start generated code]*/
13020
13021static PyObject *
13022unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13023 Py_ssize_t count)
13024/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013025{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013026 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013027 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013028 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013029}
13030
sweeneydea81849b2020-04-22 17:05:48 -040013031/*[clinic input]
13032str.removeprefix as unicode_removeprefix
13033
13034 prefix: unicode
13035 /
13036
13037Return a str with the given prefix string removed if present.
13038
13039If the string starts with the prefix string, return string[len(prefix):].
13040Otherwise, return a copy of the original string.
13041[clinic start generated code]*/
13042
13043static PyObject *
13044unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13045/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13046{
13047 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13048 if (match == -1) {
13049 return NULL;
13050 }
13051 if (match) {
13052 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13053 PyUnicode_GET_LENGTH(self));
13054 }
13055 return unicode_result_unchanged(self);
13056}
13057
13058/*[clinic input]
13059str.removesuffix as unicode_removesuffix
13060
13061 suffix: unicode
13062 /
13063
13064Return a str with the given suffix string removed if present.
13065
13066If the string ends with the suffix string and that suffix is not empty,
13067return string[:-len(suffix)]. Otherwise, return a copy of the original
13068string.
13069[clinic start generated code]*/
13070
13071static PyObject *
13072unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13073/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13074{
13075 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13076 if (match == -1) {
13077 return NULL;
13078 }
13079 if (match) {
13080 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13081 - PyUnicode_GET_LENGTH(suffix));
13082 }
13083 return unicode_result_unchanged(self);
13084}
13085
Alexander Belopolsky40018472011-02-26 01:02:56 +000013086static PyObject *
13087unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013089 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 Py_ssize_t isize;
13091 Py_ssize_t osize, squote, dquote, i, o;
13092 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013093 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013094 const void *idata;
13095 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013097 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013098 return NULL;
13099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013100 isize = PyUnicode_GET_LENGTH(unicode);
13101 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013103 /* Compute length of output, quote characters, and
13104 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013105 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 max = 127;
13107 squote = dquote = 0;
13108 ikind = PyUnicode_KIND(unicode);
13109 for (i = 0; i < isize; i++) {
13110 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013111 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013113 case '\'': squote++; break;
13114 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013115 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013116 incr = 2;
13117 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 default:
13119 /* Fast-path ASCII */
13120 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013121 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013123 ;
13124 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013125 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013127 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013129 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013131 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013133 if (osize > PY_SSIZE_T_MAX - incr) {
13134 PyErr_SetString(PyExc_OverflowError,
13135 "string is too long to generate repr");
13136 return NULL;
13137 }
13138 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 }
13140
13141 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013142 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013144 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 if (dquote)
13146 /* Both squote and dquote present. Use squote,
13147 and escape them */
13148 osize += squote;
13149 else
13150 quote = '"';
13151 }
Victor Stinner55c08782013-04-14 18:45:39 +020013152 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153
13154 repr = PyUnicode_New(osize, max);
13155 if (repr == NULL)
13156 return NULL;
13157 okind = PyUnicode_KIND(repr);
13158 odata = PyUnicode_DATA(repr);
13159
13160 PyUnicode_WRITE(okind, odata, 0, quote);
13161 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013162 if (unchanged) {
13163 _PyUnicode_FastCopyCharacters(repr, 1,
13164 unicode, 0,
13165 isize);
13166 }
13167 else {
13168 for (i = 0, o = 1; i < isize; i++) {
13169 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013170
Victor Stinner55c08782013-04-14 18:45:39 +020013171 /* Escape quotes and backslashes */
13172 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013173 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013174 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013175 continue;
13176 }
13177
13178 /* Map special whitespace to '\t', \n', '\r' */
13179 if (ch == '\t') {
13180 PyUnicode_WRITE(okind, odata, o++, '\\');
13181 PyUnicode_WRITE(okind, odata, o++, 't');
13182 }
13183 else if (ch == '\n') {
13184 PyUnicode_WRITE(okind, odata, o++, '\\');
13185 PyUnicode_WRITE(okind, odata, o++, 'n');
13186 }
13187 else if (ch == '\r') {
13188 PyUnicode_WRITE(okind, odata, o++, '\\');
13189 PyUnicode_WRITE(okind, odata, o++, 'r');
13190 }
13191
13192 /* Map non-printable US ASCII to '\xhh' */
13193 else if (ch < ' ' || ch == 0x7F) {
13194 PyUnicode_WRITE(okind, odata, o++, '\\');
13195 PyUnicode_WRITE(okind, odata, o++, 'x');
13196 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13197 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13198 }
13199
13200 /* Copy ASCII characters as-is */
13201 else if (ch < 0x7F) {
13202 PyUnicode_WRITE(okind, odata, o++, ch);
13203 }
13204
13205 /* Non-ASCII characters */
13206 else {
13207 /* Map Unicode whitespace and control characters
13208 (categories Z* and C* except ASCII space)
13209 */
13210 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13211 PyUnicode_WRITE(okind, odata, o++, '\\');
13212 /* Map 8-bit characters to '\xhh' */
13213 if (ch <= 0xff) {
13214 PyUnicode_WRITE(okind, odata, o++, 'x');
13215 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13216 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13217 }
13218 /* Map 16-bit characters to '\uxxxx' */
13219 else if (ch <= 0xffff) {
13220 PyUnicode_WRITE(okind, odata, o++, 'u');
13221 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13222 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13223 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13224 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13225 }
13226 /* Map 21-bit characters to '\U00xxxxxx' */
13227 else {
13228 PyUnicode_WRITE(okind, odata, o++, 'U');
13229 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13230 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13231 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13232 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13233 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13234 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13235 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13236 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13237 }
13238 }
13239 /* Copy characters as-is */
13240 else {
13241 PyUnicode_WRITE(okind, odata, o++, ch);
13242 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013243 }
13244 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013246 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013247 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013248 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249}
13250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013251PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013252 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253\n\
13254Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013255such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256arguments start and end are interpreted as in slice notation.\n\
13257\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013258Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013259
13260static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013261unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013263 /* initialize variables to prevent gcc warning */
13264 PyObject *substring = NULL;
13265 Py_ssize_t start = 0;
13266 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013267 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013269 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013272 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013273 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013275 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 if (result == -2)
13278 return NULL;
13279
Christian Heimes217cfd12007-12-02 14:31:20 +000013280 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281}
13282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013283PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013284 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013286Return the highest index in S where substring sub is found,\n\
13287such that sub is contained within S[start:end]. Optional\n\
13288arguments start and end are interpreted as in slice notation.\n\
13289\n\
13290Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291
13292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013295 /* initialize variables to prevent gcc warning */
13296 PyObject *substring = NULL;
13297 Py_ssize_t start = 0;
13298 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013299 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013301 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013304 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013307 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309 if (result == -2)
13310 return NULL;
13311
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312 if (result < 0) {
13313 PyErr_SetString(PyExc_ValueError, "substring not found");
13314 return NULL;
13315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316
Christian Heimes217cfd12007-12-02 14:31:20 +000013317 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318}
13319
INADA Naoki3ae20562017-01-16 20:41:20 +090013320/*[clinic input]
13321str.rjust as unicode_rjust
13322
13323 width: Py_ssize_t
13324 fillchar: Py_UCS4 = ' '
13325 /
13326
13327Return a right-justified string of length width.
13328
13329Padding is done using the specified fill character (default is a space).
13330[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331
13332static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013333unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13334/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013336 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013337 return NULL;
13338
Victor Stinnerc4b49542011-12-11 22:44:26 +010013339 if (PyUnicode_GET_LENGTH(self) >= width)
13340 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341
Victor Stinnerc4b49542011-12-11 22:44:26 +010013342 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343}
13344
Alexander Belopolsky40018472011-02-26 01:02:56 +000013345PyObject *
13346PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013348 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013349 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013351 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352}
13353
INADA Naoki3ae20562017-01-16 20:41:20 +090013354/*[clinic input]
13355str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356
INADA Naoki3ae20562017-01-16 20:41:20 +090013357 sep: object = None
13358 The delimiter according which to split the string.
13359 None (the default value) means split according to any whitespace,
13360 and discard empty strings from the result.
13361 maxsplit: Py_ssize_t = -1
13362 Maximum number of splits to do.
13363 -1 (the default value) means no limit.
13364
13365Return a list of the words in the string, using sep as the delimiter string.
13366[clinic start generated code]*/
13367
13368static PyObject *
13369unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13370/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371{
INADA Naoki3ae20562017-01-16 20:41:20 +090013372 if (sep == Py_None)
13373 return split(self, NULL, maxsplit);
13374 if (PyUnicode_Check(sep))
13375 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013376
Victor Stinner998b8062018-09-12 00:23:25 +020013377 PyErr_Format(PyExc_TypeError,
13378 "must be str or None, not %.100s",
13379 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381}
13382
Thomas Wouters477c8d52006-05-27 19:21:47 +000013383PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013384PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013385{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013386 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013387 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013388 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013389 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013390
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013391 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013392 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013393
Victor Stinner14f8f022011-10-05 20:58:25 +020013394 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013395 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013396 len1 = PyUnicode_GET_LENGTH(str_obj);
13397 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013398 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013399 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013400 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013401 }
13402 buf1 = PyUnicode_DATA(str_obj);
13403 buf2 = PyUnicode_DATA(sep_obj);
13404 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013405 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013406 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013407 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013409
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013410 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013411 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013412 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13413 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13414 else
13415 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013416 break;
13417 case PyUnicode_2BYTE_KIND:
13418 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13419 break;
13420 case PyUnicode_4BYTE_KIND:
13421 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13422 break;
13423 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013424 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013425 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013426
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013427 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013428 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013429 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013430
13431 return out;
13432}
13433
13434
13435PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013436PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013437{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013438 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013439 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013440 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013441 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013442
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013443 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013445
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013446 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013447 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013448 len1 = PyUnicode_GET_LENGTH(str_obj);
13449 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013450 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013451 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013452 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013453 }
13454 buf1 = PyUnicode_DATA(str_obj);
13455 buf2 = PyUnicode_DATA(sep_obj);
13456 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013457 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013458 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013459 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013461
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013462 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013463 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013464 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13465 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13466 else
13467 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013468 break;
13469 case PyUnicode_2BYTE_KIND:
13470 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13471 break;
13472 case PyUnicode_4BYTE_KIND:
13473 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13474 break;
13475 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013476 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013477 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013478
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013479 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013480 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013481 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013482
13483 return out;
13484}
13485
INADA Naoki3ae20562017-01-16 20:41:20 +090013486/*[clinic input]
13487str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013488
INADA Naoki3ae20562017-01-16 20:41:20 +090013489 sep: object
13490 /
13491
13492Partition the string into three parts using the given separator.
13493
13494This will search for the separator in the string. If the separator is found,
13495returns a 3-tuple containing the part before the separator, the separator
13496itself, and the part after it.
13497
13498If the separator is not found, returns a 3-tuple containing the original string
13499and two empty strings.
13500[clinic start generated code]*/
13501
13502static PyObject *
13503unicode_partition(PyObject *self, PyObject *sep)
13504/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013505{
INADA Naoki3ae20562017-01-16 20:41:20 +090013506 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013507}
13508
INADA Naoki3ae20562017-01-16 20:41:20 +090013509/*[clinic input]
13510str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013511
INADA Naoki3ae20562017-01-16 20:41:20 +090013512Partition the string into three parts using the given separator.
13513
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013514This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013515the separator is found, returns a 3-tuple containing the part before the
13516separator, the separator itself, and the part after it.
13517
13518If the separator is not found, returns a 3-tuple containing two empty strings
13519and the original string.
13520[clinic start generated code]*/
13521
13522static PyObject *
13523unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013524/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013525{
INADA Naoki3ae20562017-01-16 20:41:20 +090013526 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013527}
13528
Alexander Belopolsky40018472011-02-26 01:02:56 +000013529PyObject *
13530PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013531{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013532 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013533 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013534
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013535 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013536}
13537
INADA Naoki3ae20562017-01-16 20:41:20 +090013538/*[clinic input]
13539str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013540
INADA Naoki3ae20562017-01-16 20:41:20 +090013541Return a list of the words in the string, using sep as the delimiter string.
13542
13543Splits are done starting at the end of the string and working to the front.
13544[clinic start generated code]*/
13545
13546static PyObject *
13547unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13548/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013549{
INADA Naoki3ae20562017-01-16 20:41:20 +090013550 if (sep == Py_None)
13551 return rsplit(self, NULL, maxsplit);
13552 if (PyUnicode_Check(sep))
13553 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013554
Victor Stinner998b8062018-09-12 00:23:25 +020013555 PyErr_Format(PyExc_TypeError,
13556 "must be str or None, not %.100s",
13557 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013558 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013559}
13560
INADA Naoki3ae20562017-01-16 20:41:20 +090013561/*[clinic input]
13562str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013563
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013564 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013565
13566Return a list of the lines in the string, breaking at line boundaries.
13567
13568Line breaks are not included in the resulting list unless keepends is given and
13569true.
13570[clinic start generated code]*/
13571
13572static PyObject *
13573unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013574/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013575{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013576 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013577}
13578
13579static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013580PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013581{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013582 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013583}
13584
INADA Naoki3ae20562017-01-16 20:41:20 +090013585/*[clinic input]
13586str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013587
INADA Naoki3ae20562017-01-16 20:41:20 +090013588Convert uppercase characters to lowercase and lowercase characters to uppercase.
13589[clinic start generated code]*/
13590
13591static PyObject *
13592unicode_swapcase_impl(PyObject *self)
13593/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013594{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013595 if (PyUnicode_READY(self) == -1)
13596 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013597 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013598}
13599
Larry Hastings61272b72014-01-07 12:41:53 -080013600/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013601
Larry Hastings31826802013-10-19 00:09:25 -070013602@staticmethod
13603str.maketrans as unicode_maketrans
13604
13605 x: object
13606
13607 y: unicode=NULL
13608
13609 z: unicode=NULL
13610
13611 /
13612
13613Return a translation table usable for str.translate().
13614
13615If there is only one argument, it must be a dictionary mapping Unicode
13616ordinals (integers) or characters to Unicode ordinals, strings or None.
13617Character keys will be then converted to ordinals.
13618If there are two arguments, they must be strings of equal length, and
13619in the resulting dictionary, each character in x will be mapped to the
13620character at the same position in y. If there is a third argument, it
13621must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013622[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013623
Larry Hastings31826802013-10-19 00:09:25 -070013624static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013625unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013626/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013627{
Georg Brandlceee0772007-11-27 23:48:05 +000013628 PyObject *new = NULL, *key, *value;
13629 Py_ssize_t i = 0;
13630 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013631
Georg Brandlceee0772007-11-27 23:48:05 +000013632 new = PyDict_New();
13633 if (!new)
13634 return NULL;
13635 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013636 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013637 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013638
Georg Brandlceee0772007-11-27 23:48:05 +000013639 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013640 if (!PyUnicode_Check(x)) {
13641 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13642 "be a string if there is a second argument");
13643 goto err;
13644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013645 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013646 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13647 "arguments must have equal length");
13648 goto err;
13649 }
13650 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013651 x_kind = PyUnicode_KIND(x);
13652 y_kind = PyUnicode_KIND(y);
13653 x_data = PyUnicode_DATA(x);
13654 y_data = PyUnicode_DATA(y);
13655 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13656 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013657 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013658 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013659 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013660 if (!value) {
13661 Py_DECREF(key);
13662 goto err;
13663 }
Georg Brandlceee0772007-11-27 23:48:05 +000013664 res = PyDict_SetItem(new, key, value);
13665 Py_DECREF(key);
13666 Py_DECREF(value);
13667 if (res < 0)
13668 goto err;
13669 }
13670 /* create entries for deleting chars in z */
13671 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013672 z_kind = PyUnicode_KIND(z);
13673 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013674 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013675 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013676 if (!key)
13677 goto err;
13678 res = PyDict_SetItem(new, key, Py_None);
13679 Py_DECREF(key);
13680 if (res < 0)
13681 goto err;
13682 }
13683 }
13684 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013685 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013686 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013687
Georg Brandlceee0772007-11-27 23:48:05 +000013688 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013689 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013690 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13691 "to maketrans it must be a dict");
13692 goto err;
13693 }
13694 /* copy entries into the new dict, converting string keys to int keys */
13695 while (PyDict_Next(x, &i, &key, &value)) {
13696 if (PyUnicode_Check(key)) {
13697 /* convert string keys to integer keys */
13698 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013699 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013700 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13701 "table must be of length 1");
13702 goto err;
13703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013704 kind = PyUnicode_KIND(key);
13705 data = PyUnicode_DATA(key);
13706 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013707 if (!newkey)
13708 goto err;
13709 res = PyDict_SetItem(new, newkey, value);
13710 Py_DECREF(newkey);
13711 if (res < 0)
13712 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013713 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013714 /* just keep integer keys */
13715 if (PyDict_SetItem(new, key, value) < 0)
13716 goto err;
13717 } else {
13718 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13719 "be strings or integers");
13720 goto err;
13721 }
13722 }
13723 }
13724 return new;
13725 err:
13726 Py_DECREF(new);
13727 return NULL;
13728}
13729
INADA Naoki3ae20562017-01-16 20:41:20 +090013730/*[clinic input]
13731str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013732
INADA Naoki3ae20562017-01-16 20:41:20 +090013733 table: object
13734 Translation table, which must be a mapping of Unicode ordinals to
13735 Unicode ordinals, strings, or None.
13736 /
13737
13738Replace each character in the string using the given translation table.
13739
13740The table must implement lookup/indexing via __getitem__, for instance a
13741dictionary or list. If this operation raises LookupError, the character is
13742left untouched. Characters mapped to None are deleted.
13743[clinic start generated code]*/
13744
13745static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013746unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013747/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013748{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013749 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013750}
13751
INADA Naoki3ae20562017-01-16 20:41:20 +090013752/*[clinic input]
13753str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013754
INADA Naoki3ae20562017-01-16 20:41:20 +090013755Return a copy of the string converted to uppercase.
13756[clinic start generated code]*/
13757
13758static PyObject *
13759unicode_upper_impl(PyObject *self)
13760/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013761{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013762 if (PyUnicode_READY(self) == -1)
13763 return NULL;
13764 if (PyUnicode_IS_ASCII(self))
13765 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013766 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013767}
13768
INADA Naoki3ae20562017-01-16 20:41:20 +090013769/*[clinic input]
13770str.zfill as unicode_zfill
13771
13772 width: Py_ssize_t
13773 /
13774
13775Pad a numeric string with zeros on the left, to fill a field of the given width.
13776
13777The string is never truncated.
13778[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013779
13780static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013781unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013782/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013783{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013784 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013785 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013786 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013787 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013788 Py_UCS4 chr;
13789
Benjamin Petersonbac79492012-01-14 13:34:47 -050013790 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013791 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013792
Victor Stinnerc4b49542011-12-11 22:44:26 +010013793 if (PyUnicode_GET_LENGTH(self) >= width)
13794 return unicode_result_unchanged(self);
13795
13796 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013797
13798 u = pad(self, fill, 0, '0');
13799
Walter Dörwald068325e2002-04-15 13:36:47 +000013800 if (u == NULL)
13801 return NULL;
13802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013803 kind = PyUnicode_KIND(u);
13804 data = PyUnicode_DATA(u);
13805 chr = PyUnicode_READ(kind, data, fill);
13806
13807 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013809 PyUnicode_WRITE(kind, data, 0, chr);
13810 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013811 }
13812
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013813 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013814 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013815}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013816
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013825PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013826 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013827\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013828Return True if S starts with the specified prefix, False otherwise.\n\
13829With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013830With optional end, stop comparing S at that position.\n\
13831prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832
13833static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013834unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013835 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013836{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013837 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013838 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013839 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013840 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013841 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013842
Jesus Ceaac451502011-04-20 17:09:23 +020013843 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013844 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013845 if (PyTuple_Check(subobj)) {
13846 Py_ssize_t i;
13847 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013848 substring = PyTuple_GET_ITEM(subobj, i);
13849 if (!PyUnicode_Check(substring)) {
13850 PyErr_Format(PyExc_TypeError,
13851 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013852 "not %.100s",
13853 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013854 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013855 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013856 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013857 if (result == -1)
13858 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013859 if (result) {
13860 Py_RETURN_TRUE;
13861 }
13862 }
13863 /* nothing matched */
13864 Py_RETURN_FALSE;
13865 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013866 if (!PyUnicode_Check(subobj)) {
13867 PyErr_Format(PyExc_TypeError,
13868 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013869 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013870 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013871 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013872 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013873 if (result == -1)
13874 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013875 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013876}
13877
13878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013879PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013880 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013881\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013882Return True if S ends with the specified suffix, False otherwise.\n\
13883With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013884With optional end, stop comparing S at that position.\n\
13885suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013886
13887static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013888unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013889 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013890{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013891 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013892 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013893 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013894 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013895 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013896
Jesus Ceaac451502011-04-20 17:09:23 +020013897 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013898 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013899 if (PyTuple_Check(subobj)) {
13900 Py_ssize_t i;
13901 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013902 substring = PyTuple_GET_ITEM(subobj, i);
13903 if (!PyUnicode_Check(substring)) {
13904 PyErr_Format(PyExc_TypeError,
13905 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013906 "not %.100s",
13907 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013908 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013909 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013910 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013911 if (result == -1)
13912 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013913 if (result) {
13914 Py_RETURN_TRUE;
13915 }
13916 }
13917 Py_RETURN_FALSE;
13918 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013919 if (!PyUnicode_Check(subobj)) {
13920 PyErr_Format(PyExc_TypeError,
13921 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013922 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013923 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013924 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013925 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013926 if (result == -1)
13927 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013928 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013929}
13930
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013931static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013932_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013933{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013934 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13935 writer->data = PyUnicode_DATA(writer->buffer);
13936
13937 if (!writer->readonly) {
13938 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013939 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013940 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013941 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013942 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13943 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13944 writer->kind = PyUnicode_WCHAR_KIND;
13945 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13946
Victor Stinner8f674cc2013-04-17 23:02:17 +020013947 /* Copy-on-write mode: set buffer size to 0 so
13948 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13949 * next write. */
13950 writer->size = 0;
13951 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013952}
13953
Victor Stinnerd3f08822012-05-29 12:57:52 +020013954void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013955_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013956{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013957 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013958
13959 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013960 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013961
13962 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13963 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13964 writer->kind = PyUnicode_WCHAR_KIND;
13965 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013966}
13967
Inada Naoki770847a2019-06-24 12:30:24 +090013968// Initialize _PyUnicodeWriter with initial buffer
13969static inline void
13970_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13971{
13972 memset(writer, 0, sizeof(*writer));
13973 writer->buffer = buffer;
13974 _PyUnicodeWriter_Update(writer);
13975 writer->min_length = writer->size;
13976}
13977
Victor Stinnerd3f08822012-05-29 12:57:52 +020013978int
13979_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13980 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013981{
13982 Py_ssize_t newlen;
13983 PyObject *newbuffer;
13984
Victor Stinner2740e462016-09-06 16:58:36 -070013985 assert(maxchar <= MAX_UNICODE);
13986
Victor Stinnerca9381e2015-09-22 00:58:32 +020013987 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013988 assert((maxchar > writer->maxchar && length >= 0)
13989 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013990
Victor Stinner202fdca2012-05-07 12:47:02 +020013991 if (length > PY_SSIZE_T_MAX - writer->pos) {
13992 PyErr_NoMemory();
13993 return -1;
13994 }
13995 newlen = writer->pos + length;
13996
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013997 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013998
Victor Stinnerd3f08822012-05-29 12:57:52 +020013999 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020014000 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010014001 if (writer->overallocate
14002 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14003 /* overallocate to limit the number of realloc() */
14004 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014005 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014006 if (newlen < writer->min_length)
14007 newlen = writer->min_length;
14008
Victor Stinnerd3f08822012-05-29 12:57:52 +020014009 writer->buffer = PyUnicode_New(newlen, maxchar);
14010 if (writer->buffer == NULL)
14011 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014012 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014013 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014014 if (writer->overallocate
14015 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14016 /* overallocate to limit the number of realloc() */
14017 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014018 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014019 if (newlen < writer->min_length)
14020 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014021
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014022 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014023 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014024 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014025 newbuffer = PyUnicode_New(newlen, maxchar);
14026 if (newbuffer == NULL)
14027 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014028 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14029 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014030 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014031 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014032 }
14033 else {
14034 newbuffer = resize_compact(writer->buffer, newlen);
14035 if (newbuffer == NULL)
14036 return -1;
14037 }
14038 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014039 }
14040 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014041 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014042 newbuffer = PyUnicode_New(writer->size, maxchar);
14043 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014044 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014045 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14046 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014047 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014048 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014049 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014050 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014051
14052#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014053}
14054
Victor Stinnerca9381e2015-09-22 00:58:32 +020014055int
14056_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14057 enum PyUnicode_Kind kind)
14058{
14059 Py_UCS4 maxchar;
14060
14061 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14062 assert(writer->kind < kind);
14063
14064 switch (kind)
14065 {
14066 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14067 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14068 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14069 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014070 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014071 }
14072
14073 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14074}
14075
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014076static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014077_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014078{
Victor Stinner2740e462016-09-06 16:58:36 -070014079 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014080 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14081 return -1;
14082 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14083 writer->pos++;
14084 return 0;
14085}
14086
14087int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014088_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14089{
14090 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14091}
14092
14093int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014094_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14095{
14096 Py_UCS4 maxchar;
14097 Py_ssize_t len;
14098
14099 if (PyUnicode_READY(str) == -1)
14100 return -1;
14101 len = PyUnicode_GET_LENGTH(str);
14102 if (len == 0)
14103 return 0;
14104 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14105 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014106 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014107 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014108 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014109 Py_INCREF(str);
14110 writer->buffer = str;
14111 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014112 writer->pos += len;
14113 return 0;
14114 }
14115 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14116 return -1;
14117 }
14118 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14119 str, 0, len);
14120 writer->pos += len;
14121 return 0;
14122}
14123
Victor Stinnere215d962012-10-06 23:03:36 +020014124int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014125_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14126 Py_ssize_t start, Py_ssize_t end)
14127{
14128 Py_UCS4 maxchar;
14129 Py_ssize_t len;
14130
14131 if (PyUnicode_READY(str) == -1)
14132 return -1;
14133
14134 assert(0 <= start);
14135 assert(end <= PyUnicode_GET_LENGTH(str));
14136 assert(start <= end);
14137
14138 if (end == 0)
14139 return 0;
14140
14141 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14142 return _PyUnicodeWriter_WriteStr(writer, str);
14143
14144 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14145 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14146 else
14147 maxchar = writer->maxchar;
14148 len = end - start;
14149
14150 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14151 return -1;
14152
14153 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14154 str, start, len);
14155 writer->pos += len;
14156 return 0;
14157}
14158
14159int
Victor Stinner4a587072013-11-19 12:54:53 +010014160_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14161 const char *ascii, Py_ssize_t len)
14162{
14163 if (len == -1)
14164 len = strlen(ascii);
14165
Andy Lestere6be9b52020-02-11 20:28:35 -060014166 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014167
14168 if (writer->buffer == NULL && !writer->overallocate) {
14169 PyObject *str;
14170
14171 str = _PyUnicode_FromASCII(ascii, len);
14172 if (str == NULL)
14173 return -1;
14174
14175 writer->readonly = 1;
14176 writer->buffer = str;
14177 _PyUnicodeWriter_Update(writer);
14178 writer->pos += len;
14179 return 0;
14180 }
14181
14182 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14183 return -1;
14184
14185 switch (writer->kind)
14186 {
14187 case PyUnicode_1BYTE_KIND:
14188 {
14189 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14190 Py_UCS1 *data = writer->data;
14191
Christian Heimesf051e432016-09-13 20:22:02 +020014192 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014193 break;
14194 }
14195 case PyUnicode_2BYTE_KIND:
14196 {
14197 _PyUnicode_CONVERT_BYTES(
14198 Py_UCS1, Py_UCS2,
14199 ascii, ascii + len,
14200 (Py_UCS2 *)writer->data + writer->pos);
14201 break;
14202 }
14203 case PyUnicode_4BYTE_KIND:
14204 {
14205 _PyUnicode_CONVERT_BYTES(
14206 Py_UCS1, Py_UCS4,
14207 ascii, ascii + len,
14208 (Py_UCS4 *)writer->data + writer->pos);
14209 break;
14210 }
14211 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014212 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014213 }
14214
14215 writer->pos += len;
14216 return 0;
14217}
14218
14219int
14220_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14221 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014222{
14223 Py_UCS4 maxchar;
14224
Andy Lestere6be9b52020-02-11 20:28:35 -060014225 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014226 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14227 return -1;
14228 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14229 writer->pos += len;
14230 return 0;
14231}
14232
Victor Stinnerd3f08822012-05-29 12:57:52 +020014233PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014234_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014235{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014236 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014237
Victor Stinnerd3f08822012-05-29 12:57:52 +020014238 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014239 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014240 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014241 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014242
14243 str = writer->buffer;
14244 writer->buffer = NULL;
14245
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014246 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014247 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14248 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014249 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014250
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014251 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14252 PyObject *str2;
14253 str2 = resize_compact(str, writer->pos);
14254 if (str2 == NULL) {
14255 Py_DECREF(str);
14256 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014257 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014258 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014259 }
14260
Victor Stinner15a0bd32013-07-08 22:29:55 +020014261 assert(_PyUnicode_CheckConsistency(str, 1));
14262 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014263}
14264
Victor Stinnerd3f08822012-05-29 12:57:52 +020014265void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014266_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014267{
14268 Py_CLEAR(writer->buffer);
14269}
14270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014271#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014272
14273PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014274 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014275\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014276Return a formatted version of S, using substitutions from args and kwargs.\n\
14277The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014278
Eric Smith27bbca62010-11-04 17:06:58 +000014279PyDoc_STRVAR(format_map__doc__,
14280 "S.format_map(mapping) -> str\n\
14281\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014282Return a formatted version of S, using substitutions from mapping.\n\
14283The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014284
INADA Naoki3ae20562017-01-16 20:41:20 +090014285/*[clinic input]
14286str.__format__ as unicode___format__
14287
14288 format_spec: unicode
14289 /
14290
14291Return a formatted version of the string as described by format_spec.
14292[clinic start generated code]*/
14293
Eric Smith4a7d76d2008-05-30 18:10:19 +000014294static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014295unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014296/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014297{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014298 _PyUnicodeWriter writer;
14299 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014300
Victor Stinnerd3f08822012-05-29 12:57:52 +020014301 if (PyUnicode_READY(self) == -1)
14302 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014303 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014304 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14305 self, format_spec, 0,
14306 PyUnicode_GET_LENGTH(format_spec));
14307 if (ret == -1) {
14308 _PyUnicodeWriter_Dealloc(&writer);
14309 return NULL;
14310 }
14311 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014312}
14313
INADA Naoki3ae20562017-01-16 20:41:20 +090014314/*[clinic input]
14315str.__sizeof__ as unicode_sizeof
14316
14317Return the size of the string in memory, in bytes.
14318[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014319
14320static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014321unicode_sizeof_impl(PyObject *self)
14322/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014324 Py_ssize_t size;
14325
14326 /* If it's a compact object, account for base structure +
14327 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014328 if (PyUnicode_IS_COMPACT_ASCII(self))
14329 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14330 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014331 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014332 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014333 else {
14334 /* If it is a two-block object, account for base object, and
14335 for character block if present. */
14336 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014337 if (_PyUnicode_DATA_ANY(self))
14338 size += (PyUnicode_GET_LENGTH(self) + 1) *
14339 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014340 }
14341 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014342 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014343 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14344 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14345 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14346 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014347
14348 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014349}
14350
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014351static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014352unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014353{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014354 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014355 if (!copy)
14356 return NULL;
14357 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014358}
14359
/* Method table of the str type.

   The UNICODE_*_METHODDEF entries are macros defined elsewhere (they carry
   no trailing comma here, so each one presumably expands to a complete
   initializer followed by a comma -- confirm against the generated clinic
   header).  The remaining entries are written out by hand. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* do_string_format takes keyword arguments, so its signature differs
       from PyCFunction; the intermediate void(*)(void) cast makes the
       conversion explicit. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    /* Sentinel terminating the table. */
    {NULL, NULL}
};
14418
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014419static PyObject *
14420unicode_mod(PyObject *v, PyObject *w)
14421{
Brian Curtindfc80e32011-08-10 20:28:54 -050014422 if (!PyUnicode_Check(v))
14423 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014424 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014425}
14426
14427static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014428 0, /*nb_add*/
14429 0, /*nb_subtract*/
14430 0, /*nb_multiply*/
14431 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014432};
14433
Guido van Rossumd57fd912000-03-10 22:53:23 +000014434static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014435 (lenfunc) unicode_length, /* sq_length */
14436 PyUnicode_Concat, /* sq_concat */
14437 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14438 (ssizeargfunc) unicode_getitem, /* sq_item */
14439 0, /* sq_slice */
14440 0, /* sq_ass_item */
14441 0, /* sq_ass_slice */
14442 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014443};
14444
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014445static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014446unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014448 if (PyUnicode_READY(self) == -1)
14449 return NULL;
14450
Victor Stinnera15e2602020-04-08 02:01:56 +020014451 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014452 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014453 if (i == -1 && PyErr_Occurred())
14454 return NULL;
14455 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014456 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014457 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014458 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014459 Py_ssize_t start, stop, step, slicelength, i;
14460 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014461 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014462 const void *src_data;
14463 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014464 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014465 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014466
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014467 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014468 return NULL;
14469 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014470 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14471 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014472
14473 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014474 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014475 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014476 slicelength == PyUnicode_GET_LENGTH(self)) {
14477 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014478 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014479 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014480 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014481 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014482 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014483 src_kind = PyUnicode_KIND(self);
14484 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014485 if (!PyUnicode_IS_ASCII(self)) {
14486 kind_limit = kind_maxchar_limit(src_kind);
14487 max_char = 0;
14488 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14489 ch = PyUnicode_READ(src_kind, src_data, cur);
14490 if (ch > max_char) {
14491 max_char = ch;
14492 if (max_char >= kind_limit)
14493 break;
14494 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014495 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014496 }
Victor Stinner55c99112011-10-13 01:17:06 +020014497 else
14498 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014499 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014500 if (result == NULL)
14501 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014502 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014503 dest_data = PyUnicode_DATA(result);
14504
14505 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014506 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14507 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014508 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014509 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014510 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014511 } else {
14512 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14513 return NULL;
14514 }
14515}
14516
14517static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014518 (lenfunc)unicode_length, /* mp_length */
14519 (binaryfunc)unicode_subscript, /* mp_subscript */
14520 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014521};
14522
Guido van Rossumd57fd912000-03-10 22:53:23 +000014523
Guido van Rossumd57fd912000-03-10 22:53:23 +000014524/* Helpers for PyUnicode_Format() */
14525
/* State shared by the PyUnicode_Format() helpers while one format operation
   is in progress. */
struct unicode_formatter_t {
    /* The argument(s) being formatted.  When arglen >= 0 this is indexed as
       a tuple; when arglen < 0 the object itself is the single argument
       (see unicode_format_getnextarg()). */
    PyObject *args;
    int args_owned;     /* presumably non-zero when args must be released by
                           the formatter -- TODO confirm against the caller */
    Py_ssize_t arglen, argidx;  /* argument count and index of the next
                                   argument to hand out */
    PyObject *dict;     /* presumably the mapping used for %(name)s lookups
                           -- TODO confirm */

    /* Format string being processed.  NOTE(review): fmtkind/fmtdata look
       like the kind and raw data of fmtstr, and fmtcnt/fmtpos like the
       remaining character count and current position -- confirm against
       PyUnicode_Format(). */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    const void *fmtdata;
    PyObject *fmtstr;

    _PyUnicodeWriter writer;    /* writer used to build the output */
};
14539
/* Description of a single conversion specifier of a format string. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character (e.g. passed on as the format
                           code when formatting floats) */
    int flags;          /* bitmask of F_* flags (F_ALT, F_SIGN, F_BLANK, ...) */
    Py_ssize_t width;   /* field width; the helpers compare it against -1,
                           presumably meaning "not given" -- TODO confirm */
    int prec;           /* precision; a negative value makes formatfloat()
                           fall back to 6 */
    int sign;           /* NOTE(review): meaning not visible from the helpers
                           in this part of the file -- confirm at the use
                           sites */
};
14547
Guido van Rossumd57fd912000-03-10 22:53:23 +000014548static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014549unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014550{
Victor Stinnera47082312012-10-04 02:19:54 +020014551 Py_ssize_t argidx = ctx->argidx;
14552
14553 if (argidx < ctx->arglen) {
14554 ctx->argidx++;
14555 if (ctx->arglen < 0)
14556 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014557 else
Victor Stinnera47082312012-10-04 02:19:54 +020014558 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014559 }
14560 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014561 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014562 return NULL;
14563}
14564
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014565/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014566
Victor Stinnera47082312012-10-04 02:19:54 +020014567/* Format a float into the writer if the writer is not NULL, or into *p_output
14568 otherwise.
14569
14570 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014571static int
Victor Stinnera47082312012-10-04 02:19:54 +020014572formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14573 PyObject **p_output,
14574 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014575{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014576 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014577 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014578 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014579 int prec;
14580 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014581
Guido van Rossumd57fd912000-03-10 22:53:23 +000014582 x = PyFloat_AsDouble(v);
14583 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014584 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014585
Victor Stinnera47082312012-10-04 02:19:54 +020014586 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014587 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014588 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014589
Victor Stinnera47082312012-10-04 02:19:54 +020014590 if (arg->flags & F_ALT)
14591 dtoa_flags = Py_DTSF_ALT;
14592 else
14593 dtoa_flags = 0;
14594 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014595 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014596 return -1;
14597 len = strlen(p);
14598 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014599 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014600 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014601 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014602 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014603 }
14604 else
14605 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014606 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014607 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014608}
14609
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the "alternate form" flag, for Python ints.
 * Return value:  a new str object, or NULL if error.
 * The output string is of the form
 *      "-"? ("0o" | "0x" | "0X")? digit+
 * The base marker is present only for o, x and X conversions with alt
 * set.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *      val     object to be converted; must be an int (asserted)
 *      alt     non-zero to keep the base marker
 *      prec    minimum number of digits; 0-fill on left if needed
 *      type    a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  the sign, if any, is placed before the base marker.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep numnondigits + prec (computed below as an int, with
       numnondigits at most 3) from overflowing. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* The two non-digit characters are the base marker. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters; when there is a sign, rewrite it just in
           front of the digits. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Assemble sign/marker, padding zeroes and digits in a temporary
           bytes object, which then replaces result. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* result is either the temporary bytes object, or a str whose text no
       longer starts at buf: build the final str from buf[0..len). */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        /* On failure result is cleared, so NULL is returned. */
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14755
Ethan Furmandf3ed242014-01-05 06:50:30 -080014756/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014757 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014758 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014759 * -1 and raise an exception on error */
14760static int
Victor Stinnera47082312012-10-04 02:19:54 +020014761mainformatlong(PyObject *v,
14762 struct unicode_format_arg_t *arg,
14763 PyObject **p_output,
14764 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014765{
14766 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014767 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014768
14769 if (!PyNumber_Check(v))
14770 goto wrongtype;
14771
Ethan Furman9ab74802014-03-21 06:38:46 -070014772 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014773 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014774 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014775 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014776 }
14777 else {
14778 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014779 }
14780 if (iobj == NULL ) {
14781 if (PyErr_ExceptionMatches(PyExc_TypeError))
14782 goto wrongtype;
14783 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014784 }
14785 assert(PyLong_Check(iobj));
14786 }
14787 else {
14788 iobj = v;
14789 Py_INCREF(iobj);
14790 }
14791
14792 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014793 && arg->width == -1 && arg->prec == -1
14794 && !(arg->flags & (F_SIGN | F_BLANK))
14795 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014796 {
14797 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014798 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014799 int base;
14800
Victor Stinnera47082312012-10-04 02:19:54 +020014801 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014802 {
14803 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014804 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014805 case 'd':
14806 case 'i':
14807 case 'u':
14808 base = 10;
14809 break;
14810 case 'o':
14811 base = 8;
14812 break;
14813 case 'x':
14814 case 'X':
14815 base = 16;
14816 break;
14817 }
14818
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014819 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14820 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014821 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014822 }
14823 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014824 return 1;
14825 }
14826
Ethan Furmanb95b5612015-01-23 20:05:18 -080014827 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014828 Py_DECREF(iobj);
14829 if (res == NULL)
14830 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014831 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014832 return 0;
14833
14834wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014835 switch(type)
14836 {
14837 case 'o':
14838 case 'x':
14839 case 'X':
14840 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014841 "%%%c format: an integer is required, "
14842 "not %.200s",
14843 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014844 break;
14845 default:
14846 PyErr_Format(PyExc_TypeError,
Serhiy Storchakae2ec0b22020-10-09 14:14:37 +030014847 "%%%c format: a real number is required, "
Victor Stinner998b8062018-09-12 00:23:25 +020014848 "not %.200s",
14849 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014850 break;
14851 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014852 return -1;
14853}
14854
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014855static Py_UCS4
14856formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014857{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014858 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014859 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014860 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014861 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014862 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014863 goto onError;
14864 }
14865 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014866 int overflow;
14867 long x = PyLong_AsLongAndOverflow(v, &overflow);
14868 if (x == -1 && PyErr_Occurred()) {
14869 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014870 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014871 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014872 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014873 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014874
Victor Stinner8faf8212011-12-08 22:14:11 +010014875 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014876 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014877 PyErr_SetString(PyExc_OverflowError,
14878 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014879 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014880 }
14881
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014882 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014883 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014884
Benjamin Peterson29060642009-01-31 22:14:21 +000014885 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014886 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014887 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014888 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014889}
14890
Victor Stinnera47082312012-10-04 02:19:54 +020014891/* Parse options of an argument: flags, width, precision.
14892 Handle also "%(name)" syntax.
14893
14894 Return 0 if the argument has been formatted into arg->str.
14895 Return 1 if the argument has been written into ctx->writer,
14896 Raise an exception and return -1 on error. */
14897static int
14898unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14899 struct unicode_format_arg_t *arg)
14900{
14901#define FORMAT_READ(ctx) \
14902 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14903
14904 PyObject *v;
14905
Victor Stinnera47082312012-10-04 02:19:54 +020014906 if (arg->ch == '(') {
14907 /* Get argument value from a dictionary. Example: "%(name)s". */
14908 Py_ssize_t keystart;
14909 Py_ssize_t keylen;
14910 PyObject *key;
14911 int pcount = 1;
14912
14913 if (ctx->dict == NULL) {
14914 PyErr_SetString(PyExc_TypeError,
14915 "format requires a mapping");
14916 return -1;
14917 }
14918 ++ctx->fmtpos;
14919 --ctx->fmtcnt;
14920 keystart = ctx->fmtpos;
14921 /* Skip over balanced parentheses */
14922 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14923 arg->ch = FORMAT_READ(ctx);
14924 if (arg->ch == ')')
14925 --pcount;
14926 else if (arg->ch == '(')
14927 ++pcount;
14928 ctx->fmtpos++;
14929 }
14930 keylen = ctx->fmtpos - keystart - 1;
14931 if (ctx->fmtcnt < 0 || pcount > 0) {
14932 PyErr_SetString(PyExc_ValueError,
14933 "incomplete format key");
14934 return -1;
14935 }
14936 key = PyUnicode_Substring(ctx->fmtstr,
14937 keystart, keystart + keylen);
14938 if (key == NULL)
14939 return -1;
14940 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014941 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014942 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014943 }
14944 ctx->args = PyObject_GetItem(ctx->dict, key);
14945 Py_DECREF(key);
14946 if (ctx->args == NULL)
14947 return -1;
14948 ctx->args_owned = 1;
14949 ctx->arglen = -1;
14950 ctx->argidx = -2;
14951 }
14952
14953 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014954 while (--ctx->fmtcnt >= 0) {
14955 arg->ch = FORMAT_READ(ctx);
14956 ctx->fmtpos++;
14957 switch (arg->ch) {
14958 case '-': arg->flags |= F_LJUST; continue;
14959 case '+': arg->flags |= F_SIGN; continue;
14960 case ' ': arg->flags |= F_BLANK; continue;
14961 case '#': arg->flags |= F_ALT; continue;
14962 case '0': arg->flags |= F_ZERO; continue;
14963 }
14964 break;
14965 }
14966
14967 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014968 if (arg->ch == '*') {
14969 v = unicode_format_getnextarg(ctx);
14970 if (v == NULL)
14971 return -1;
14972 if (!PyLong_Check(v)) {
14973 PyErr_SetString(PyExc_TypeError,
14974 "* wants int");
14975 return -1;
14976 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014977 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014978 if (arg->width == -1 && PyErr_Occurred())
14979 return -1;
14980 if (arg->width < 0) {
14981 arg->flags |= F_LJUST;
14982 arg->width = -arg->width;
14983 }
14984 if (--ctx->fmtcnt >= 0) {
14985 arg->ch = FORMAT_READ(ctx);
14986 ctx->fmtpos++;
14987 }
14988 }
14989 else if (arg->ch >= '0' && arg->ch <= '9') {
14990 arg->width = arg->ch - '0';
14991 while (--ctx->fmtcnt >= 0) {
14992 arg->ch = FORMAT_READ(ctx);
14993 ctx->fmtpos++;
14994 if (arg->ch < '0' || arg->ch > '9')
14995 break;
14996 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14997 mixing signed and unsigned comparison. Since arg->ch is between
14998 '0' and '9', casting to int is safe. */
14999 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15000 PyErr_SetString(PyExc_ValueError,
15001 "width too big");
15002 return -1;
15003 }
15004 arg->width = arg->width*10 + (arg->ch - '0');
15005 }
15006 }
15007
15008 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015009 if (arg->ch == '.') {
15010 arg->prec = 0;
15011 if (--ctx->fmtcnt >= 0) {
15012 arg->ch = FORMAT_READ(ctx);
15013 ctx->fmtpos++;
15014 }
15015 if (arg->ch == '*') {
15016 v = unicode_format_getnextarg(ctx);
15017 if (v == NULL)
15018 return -1;
15019 if (!PyLong_Check(v)) {
15020 PyErr_SetString(PyExc_TypeError,
15021 "* wants int");
15022 return -1;
15023 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015024 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015025 if (arg->prec == -1 && PyErr_Occurred())
15026 return -1;
15027 if (arg->prec < 0)
15028 arg->prec = 0;
15029 if (--ctx->fmtcnt >= 0) {
15030 arg->ch = FORMAT_READ(ctx);
15031 ctx->fmtpos++;
15032 }
15033 }
15034 else if (arg->ch >= '0' && arg->ch <= '9') {
15035 arg->prec = arg->ch - '0';
15036 while (--ctx->fmtcnt >= 0) {
15037 arg->ch = FORMAT_READ(ctx);
15038 ctx->fmtpos++;
15039 if (arg->ch < '0' || arg->ch > '9')
15040 break;
15041 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15042 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015043 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015044 return -1;
15045 }
15046 arg->prec = arg->prec*10 + (arg->ch - '0');
15047 }
15048 }
15049 }
15050
15051 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15052 if (ctx->fmtcnt >= 0) {
15053 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15054 if (--ctx->fmtcnt >= 0) {
15055 arg->ch = FORMAT_READ(ctx);
15056 ctx->fmtpos++;
15057 }
15058 }
15059 }
15060 if (ctx->fmtcnt < 0) {
15061 PyErr_SetString(PyExc_ValueError,
15062 "incomplete format");
15063 return -1;
15064 }
15065 return 0;
15066
15067#undef FORMAT_READ
15068}
15069
15070/* Format one argument. Supported conversion specifiers:
15071
15072 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015073 - "i", "d", "u": int or float
15074 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015075 - "e", "E", "f", "F", "g", "G": float
15076 - "c": int or str (1 character)
15077
Victor Stinner8dbd4212012-12-04 09:30:24 +010015078 When possible, the output is written directly into the Unicode writer
15079 (ctx->writer). A string is created when padding is required.
15080
Victor Stinnera47082312012-10-04 02:19:54 +020015081 Return 0 if the argument has been formatted into *p_str,
15082 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015083 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015084static int
15085unicode_format_arg_format(struct unicode_formatter_t *ctx,
15086 struct unicode_format_arg_t *arg,
15087 PyObject **p_str)
15088{
15089 PyObject *v;
15090 _PyUnicodeWriter *writer = &ctx->writer;
15091
15092 if (ctx->fmtcnt == 0)
15093 ctx->writer.overallocate = 0;
15094
Victor Stinnera47082312012-10-04 02:19:54 +020015095 v = unicode_format_getnextarg(ctx);
15096 if (v == NULL)
15097 return -1;
15098
Victor Stinnera47082312012-10-04 02:19:54 +020015099
15100 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015101 case 's':
15102 case 'r':
15103 case 'a':
15104 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15105 /* Fast path */
15106 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15107 return -1;
15108 return 1;
15109 }
15110
15111 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15112 *p_str = v;
15113 Py_INCREF(*p_str);
15114 }
15115 else {
15116 if (arg->ch == 's')
15117 *p_str = PyObject_Str(v);
15118 else if (arg->ch == 'r')
15119 *p_str = PyObject_Repr(v);
15120 else
15121 *p_str = PyObject_ASCII(v);
15122 }
15123 break;
15124
15125 case 'i':
15126 case 'd':
15127 case 'u':
15128 case 'o':
15129 case 'x':
15130 case 'X':
15131 {
15132 int ret = mainformatlong(v, arg, p_str, writer);
15133 if (ret != 0)
15134 return ret;
15135 arg->sign = 1;
15136 break;
15137 }
15138
15139 case 'e':
15140 case 'E':
15141 case 'f':
15142 case 'F':
15143 case 'g':
15144 case 'G':
15145 if (arg->width == -1 && arg->prec == -1
15146 && !(arg->flags & (F_SIGN | F_BLANK)))
15147 {
15148 /* Fast path */
15149 if (formatfloat(v, arg, NULL, writer) == -1)
15150 return -1;
15151 return 1;
15152 }
15153
15154 arg->sign = 1;
15155 if (formatfloat(v, arg, p_str, NULL) == -1)
15156 return -1;
15157 break;
15158
15159 case 'c':
15160 {
15161 Py_UCS4 ch = formatchar(v);
15162 if (ch == (Py_UCS4) -1)
15163 return -1;
15164 if (arg->width == -1 && arg->prec == -1) {
15165 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015166 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015167 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015168 return 1;
15169 }
15170 *p_str = PyUnicode_FromOrdinal(ch);
15171 break;
15172 }
15173
15174 default:
15175 PyErr_Format(PyExc_ValueError,
15176 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015177 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015178 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15179 (int)arg->ch,
15180 ctx->fmtpos - 1);
15181 return -1;
15182 }
15183 if (*p_str == NULL)
15184 return -1;
15185 assert (PyUnicode_Check(*p_str));
15186 return 0;
15187}
15188
15189static int
15190unicode_format_arg_output(struct unicode_formatter_t *ctx,
15191 struct unicode_format_arg_t *arg,
15192 PyObject *str)
15193{
15194 Py_ssize_t len;
15195 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015196 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015197 Py_ssize_t pindex;
15198 Py_UCS4 signchar;
15199 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015200 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015201 Py_ssize_t sublen;
15202 _PyUnicodeWriter *writer = &ctx->writer;
15203 Py_UCS4 fill;
15204
15205 fill = ' ';
15206 if (arg->sign && arg->flags & F_ZERO)
15207 fill = '0';
15208
15209 if (PyUnicode_READY(str) == -1)
15210 return -1;
15211
15212 len = PyUnicode_GET_LENGTH(str);
15213 if ((arg->width == -1 || arg->width <= len)
15214 && (arg->prec == -1 || arg->prec >= len)
15215 && !(arg->flags & (F_SIGN | F_BLANK)))
15216 {
15217 /* Fast path */
15218 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15219 return -1;
15220 return 0;
15221 }
15222
15223 /* Truncate the string for "s", "r" and "a" formats
15224 if the precision is set */
15225 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15226 if (arg->prec >= 0 && len > arg->prec)
15227 len = arg->prec;
15228 }
15229
15230 /* Adjust sign and width */
15231 kind = PyUnicode_KIND(str);
15232 pbuf = PyUnicode_DATA(str);
15233 pindex = 0;
15234 signchar = '\0';
15235 if (arg->sign) {
15236 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15237 if (ch == '-' || ch == '+') {
15238 signchar = ch;
15239 len--;
15240 pindex++;
15241 }
15242 else if (arg->flags & F_SIGN)
15243 signchar = '+';
15244 else if (arg->flags & F_BLANK)
15245 signchar = ' ';
15246 else
15247 arg->sign = 0;
15248 }
15249 if (arg->width < len)
15250 arg->width = len;
15251
15252 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015253 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015254 if (!(arg->flags & F_LJUST)) {
15255 if (arg->sign) {
15256 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015257 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015258 }
15259 else {
15260 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015261 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015262 }
15263 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015264 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15265 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015266 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015267 }
15268
Victor Stinnera47082312012-10-04 02:19:54 +020015269 buflen = arg->width;
15270 if (arg->sign && len == arg->width)
15271 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015272 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015273 return -1;
15274
15275 /* Write the sign if needed */
15276 if (arg->sign) {
15277 if (fill != ' ') {
15278 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15279 writer->pos += 1;
15280 }
15281 if (arg->width > len)
15282 arg->width--;
15283 }
15284
15285 /* Write the numeric prefix for "x", "X" and "o" formats
15286 if the alternate form is used.
15287 For example, write "0x" for the "%#x" format. */
15288 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15289 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15290 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15291 if (fill != ' ') {
15292 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15293 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15294 writer->pos += 2;
15295 pindex += 2;
15296 }
15297 arg->width -= 2;
15298 if (arg->width < 0)
15299 arg->width = 0;
15300 len -= 2;
15301 }
15302
15303 /* Pad left with the fill character if needed */
15304 if (arg->width > len && !(arg->flags & F_LJUST)) {
15305 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015306 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015307 writer->pos += sublen;
15308 arg->width = len;
15309 }
15310
15311 /* If padding with spaces: write sign if needed and/or numeric prefix if
15312 the alternate form is used */
15313 if (fill == ' ') {
15314 if (arg->sign) {
15315 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15316 writer->pos += 1;
15317 }
15318 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15319 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15320 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15321 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15322 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15323 writer->pos += 2;
15324 pindex += 2;
15325 }
15326 }
15327
15328 /* Write characters */
15329 if (len) {
15330 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15331 str, pindex, len);
15332 writer->pos += len;
15333 }
15334
15335 /* Pad right with the fill character if needed */
15336 if (arg->width > len) {
15337 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015338 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015339 writer->pos += sublen;
15340 }
15341 return 0;
15342}
15343
15344/* Helper of PyUnicode_Format(): format one arg.
15345 Return 0 on success, raise an exception and return -1 on error. */
15346static int
15347unicode_format_arg(struct unicode_formatter_t *ctx)
15348{
15349 struct unicode_format_arg_t arg;
15350 PyObject *str;
15351 int ret;
15352
Victor Stinner8dbd4212012-12-04 09:30:24 +010015353 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015354 if (arg.ch == '%') {
15355 ctx->fmtpos++;
15356 ctx->fmtcnt--;
15357 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15358 return -1;
15359 return 0;
15360 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015361 arg.flags = 0;
15362 arg.width = -1;
15363 arg.prec = -1;
15364 arg.sign = 0;
15365 str = NULL;
15366
Victor Stinnera47082312012-10-04 02:19:54 +020015367 ret = unicode_format_arg_parse(ctx, &arg);
15368 if (ret == -1)
15369 return -1;
15370
15371 ret = unicode_format_arg_format(ctx, &arg, &str);
15372 if (ret == -1)
15373 return -1;
15374
15375 if (ret != 1) {
15376 ret = unicode_format_arg_output(ctx, &arg, str);
15377 Py_DECREF(str);
15378 if (ret == -1)
15379 return -1;
15380 }
15381
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015382 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015383 PyErr_SetString(PyExc_TypeError,
15384 "not all arguments converted during string formatting");
15385 return -1;
15386 }
15387 return 0;
15388}
15389
Alexander Belopolsky40018472011-02-26 01:02:56 +000015390PyObject *
15391PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015392{
Victor Stinnera47082312012-10-04 02:19:54 +020015393 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015394
Guido van Rossumd57fd912000-03-10 22:53:23 +000015395 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015396 PyErr_BadInternalCall();
15397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015398 }
Victor Stinnera47082312012-10-04 02:19:54 +020015399
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015400 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015401 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015402
15403 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015404 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15405 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15406 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15407 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015408
Victor Stinner8f674cc2013-04-17 23:02:17 +020015409 _PyUnicodeWriter_Init(&ctx.writer);
15410 ctx.writer.min_length = ctx.fmtcnt + 100;
15411 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015412
Guido van Rossumd57fd912000-03-10 22:53:23 +000015413 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015414 ctx.arglen = PyTuple_Size(args);
15415 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015416 }
15417 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015418 ctx.arglen = -1;
15419 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015420 }
Victor Stinnera47082312012-10-04 02:19:54 +020015421 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015422 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015423 ctx.dict = args;
15424 else
15425 ctx.dict = NULL;
15426 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015427
Victor Stinnera47082312012-10-04 02:19:54 +020015428 while (--ctx.fmtcnt >= 0) {
15429 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015430 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015431
15432 nonfmtpos = ctx.fmtpos++;
15433 while (ctx.fmtcnt >= 0 &&
15434 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15435 ctx.fmtpos++;
15436 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015437 }
Victor Stinnera47082312012-10-04 02:19:54 +020015438 if (ctx.fmtcnt < 0) {
15439 ctx.fmtpos--;
15440 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015441 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015442
Victor Stinnercfc4c132013-04-03 01:48:39 +020015443 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15444 nonfmtpos, ctx.fmtpos) < 0)
15445 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015446 }
15447 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015448 ctx.fmtpos++;
15449 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015450 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015451 }
15452 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015453
Victor Stinnera47082312012-10-04 02:19:54 +020015454 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015455 PyErr_SetString(PyExc_TypeError,
15456 "not all arguments converted during string formatting");
15457 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015458 }
15459
Victor Stinnera47082312012-10-04 02:19:54 +020015460 if (ctx.args_owned) {
15461 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015462 }
Victor Stinnera47082312012-10-04 02:19:54 +020015463 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015464
Benjamin Peterson29060642009-01-31 22:14:21 +000015465 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015466 _PyUnicodeWriter_Dealloc(&ctx.writer);
15467 if (ctx.args_owned) {
15468 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015469 }
15470 return NULL;
15471}
15472
Jeremy Hylton938ace62002-07-17 16:30:39 +000015473static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015474unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15475
15476/*[clinic input]
15477@classmethod
15478str.__new__ as unicode_new
15479
15480 object as x: object = NULL
15481 encoding: str = NULL
15482 errors: str = NULL
15483
15484[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015485
Tim Peters6d6c1a32001-08-02 04:15:00 +000015486static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015487unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15488 const char *errors)
15489/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015490{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015491 PyObject *unicode;
15492 if (x == NULL) {
15493 unicode = unicode_new_empty();
15494 }
15495 else if (encoding == NULL && errors == NULL) {
15496 unicode = PyObject_Str(x);
15497 }
15498 else {
15499 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15500 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015501
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015502 if (unicode != NULL && type != &PyUnicode_Type) {
15503 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15504 }
15505 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015506}
15507
Guido van Rossume023fe02001-08-30 03:12:59 +000015508static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015509unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015510{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015511 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015512 Py_ssize_t length, char_size;
15513 int share_wstr, share_utf8;
15514 unsigned int kind;
15515 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015516
Benjamin Peterson14339b62009-01-31 16:36:08 +000015517 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015518 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015519 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015520 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015521 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015522
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015523 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015524 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015525 return NULL;
15526 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015527 kind = PyUnicode_KIND(unicode);
15528 length = PyUnicode_GET_LENGTH(unicode);
15529
15530 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015531#ifdef Py_DEBUG
15532 _PyUnicode_HASH(self) = -1;
15533#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015534 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015535#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015536 _PyUnicode_STATE(self).interned = 0;
15537 _PyUnicode_STATE(self).kind = kind;
15538 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015539 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015540 _PyUnicode_STATE(self).ready = 1;
15541 _PyUnicode_WSTR(self) = NULL;
15542 _PyUnicode_UTF8_LENGTH(self) = 0;
15543 _PyUnicode_UTF8(self) = NULL;
15544 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015545 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015546
15547 share_utf8 = 0;
15548 share_wstr = 0;
15549 if (kind == PyUnicode_1BYTE_KIND) {
15550 char_size = 1;
15551 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15552 share_utf8 = 1;
15553 }
15554 else if (kind == PyUnicode_2BYTE_KIND) {
15555 char_size = 2;
15556 if (sizeof(wchar_t) == 2)
15557 share_wstr = 1;
15558 }
15559 else {
15560 assert(kind == PyUnicode_4BYTE_KIND);
15561 char_size = 4;
15562 if (sizeof(wchar_t) == 4)
15563 share_wstr = 1;
15564 }
15565
15566 /* Ensure we won't overflow the length. */
15567 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15568 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015569 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015570 }
Victor Stinner32bd68c2020-12-01 10:37:39 +010015571 data = PyObject_Malloc((length + 1) * char_size);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015572 if (data == NULL) {
15573 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015574 goto onError;
15575 }
15576
Victor Stinnerc3c74152011-10-02 20:39:55 +020015577 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015578 if (share_utf8) {
15579 _PyUnicode_UTF8_LENGTH(self) = length;
15580 _PyUnicode_UTF8(self) = data;
15581 }
15582 if (share_wstr) {
15583 _PyUnicode_WSTR_LENGTH(self) = length;
15584 _PyUnicode_WSTR(self) = (wchar_t *)data;
15585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015586
Christian Heimesf051e432016-09-13 20:22:02 +020015587 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015588 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015589 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015590#ifdef Py_DEBUG
15591 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15592#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015593 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015594
15595onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015596 Py_DECREF(self);
15597 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015598}
15599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015600PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015601"str(object='') -> str\n\
15602str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015603\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015604Create a new string object from the given object. If encoding or\n\
15605errors is specified, then the object must expose a data buffer\n\
15606that will be decoded using the given encoding and error handler.\n\
15607Otherwise, returns the result of object.__str__() (if defined)\n\
15608or repr(object).\n\
15609encoding defaults to sys.getdefaultencoding().\n\
15610errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015611
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015612static PyObject *unicode_iter(PyObject *seq);
15613
Guido van Rossumd57fd912000-03-10 22:53:23 +000015614PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015615 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015616 "str", /* tp_name */
15617 sizeof(PyUnicodeObject), /* tp_basicsize */
15618 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015619 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015620 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015621 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015622 0, /* tp_getattr */
15623 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015624 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015625 unicode_repr, /* tp_repr */
15626 &unicode_as_number, /* tp_as_number */
15627 &unicode_as_sequence, /* tp_as_sequence */
15628 &unicode_as_mapping, /* tp_as_mapping */
15629 (hashfunc) unicode_hash, /* tp_hash*/
15630 0, /* tp_call*/
15631 (reprfunc) unicode_str, /* tp_str */
15632 PyObject_GenericGetAttr, /* tp_getattro */
15633 0, /* tp_setattro */
15634 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015635 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015636 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15637 unicode_doc, /* tp_doc */
15638 0, /* tp_traverse */
15639 0, /* tp_clear */
15640 PyUnicode_RichCompare, /* tp_richcompare */
15641 0, /* tp_weaklistoffset */
15642 unicode_iter, /* tp_iter */
15643 0, /* tp_iternext */
15644 unicode_methods, /* tp_methods */
15645 0, /* tp_members */
15646 0, /* tp_getset */
15647 &PyBaseObject_Type, /* tp_base */
15648 0, /* tp_dict */
15649 0, /* tp_descr_get */
15650 0, /* tp_descr_set */
15651 0, /* tp_dictoffset */
15652 0, /* tp_init */
15653 0, /* tp_alloc */
15654 unicode_new, /* tp_new */
15655 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015656};
15657
15658/* Initialize the Unicode implementation */
15659
Victor Stinner331a6a52019-05-27 16:39:22 +020015660PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015661_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015662{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015663 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015664 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015665 0x000A, /* LINE FEED */
15666 0x000D, /* CARRIAGE RETURN */
15667 0x001C, /* FILE SEPARATOR */
15668 0x001D, /* GROUP SEPARATOR */
15669 0x001E, /* RECORD SEPARATOR */
15670 0x0085, /* NEXT LINE */
15671 0x2028, /* LINE SEPARATOR */
15672 0x2029, /* PARAGRAPH SEPARATOR */
15673 };
15674
Victor Stinner91698d82020-06-25 14:07:40 +020015675 struct _Py_unicode_state *state = &tstate->interp->unicode;
15676 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015677 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015678 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015679
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015680 if (_Py_IsMainInterpreter(tstate)) {
15681 /* initialize the linebreak bloom filter */
15682 bloom_linebreak = make_bloom_mask(
15683 PyUnicode_2BYTE_KIND, linebreak,
15684 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015685
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015686 if (PyType_Ready(&PyUnicode_Type) < 0) {
15687 return _PyStatus_ERR("Can't initialize unicode type");
15688 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015689
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015690 if (PyType_Ready(&EncodingMapType) < 0) {
15691 return _PyStatus_ERR("Can't initialize encoding map type");
15692 }
15693 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15694 return _PyStatus_ERR("Can't initialize field name iterator type");
15695 }
15696 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15697 return _PyStatus_ERR("Can't initialize formatter iter type");
15698 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015699 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015700 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015701}
15702
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015703
Walter Dörwald16807132007-05-25 13:52:07 +000015704void
15705PyUnicode_InternInPlace(PyObject **p)
15706{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015707 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015708#ifdef Py_DEBUG
15709 assert(s != NULL);
15710 assert(_PyUnicode_CHECK(s));
15711#else
Victor Stinner607b1022020-05-05 18:50:30 +020015712 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015713 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015714 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015715#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015716
Benjamin Peterson14339b62009-01-31 16:36:08 +000015717 /* If it's a subclass, we don't really know what putting
15718 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015719 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015720 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015721 }
15722
15723 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015724 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015725 }
15726
15727#ifdef INTERNED_STRINGS
Victor Stinner666ecfb2020-07-02 01:19:57 +020015728 if (PyUnicode_READY(s) == -1) {
15729 PyErr_Clear();
15730 return;
15731 }
15732
Benjamin Peterson14339b62009-01-31 16:36:08 +000015733 if (interned == NULL) {
15734 interned = PyDict_New();
15735 if (interned == NULL) {
15736 PyErr_Clear(); /* Don't leave an exception */
15737 return;
15738 }
15739 }
Victor Stinner607b1022020-05-05 18:50:30 +020015740
15741 PyObject *t;
Berker Peksagced8d4c2016-07-25 04:40:39 +030015742 t = PyDict_SetDefault(interned, s, s);
Victor Stinner607b1022020-05-05 18:50:30 +020015743
Berker Peksagced8d4c2016-07-25 04:40:39 +030015744 if (t == NULL) {
15745 PyErr_Clear();
15746 return;
15747 }
Victor Stinner607b1022020-05-05 18:50:30 +020015748
Berker Peksagced8d4c2016-07-25 04:40:39 +030015749 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015750 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015751 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015752 return;
15753 }
Victor Stinner607b1022020-05-05 18:50:30 +020015754
Victor Stinner3549ca32020-07-03 16:59:12 +020015755 /* The two references in interned dict (key and value) are not counted by
15756 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15757 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015758 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015759 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner7f413a52020-09-23 14:05:32 +020015760#else
15761 // PyDict expects that interned strings have their hash
15762 // (PyASCIIObject.hash) already computed.
15763 (void)unicode_hash(s);
Victor Stinner607b1022020-05-05 18:50:30 +020015764#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015765}
15766
15767void
15768PyUnicode_InternImmortal(PyObject **p)
15769{
Victor Stinner583ee5a2020-10-02 14:49:00 +020015770 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15771 "PyUnicode_InternImmortal() is deprecated; "
15772 "use PyUnicode_InternInPlace() instead", 1) < 0)
15773 {
15774 // The function has no return value, the exception cannot
15775 // be reported to the caller, so just log it.
15776 PyErr_WriteUnraisable(NULL);
15777 }
15778
Benjamin Peterson14339b62009-01-31 16:36:08 +000015779 PyUnicode_InternInPlace(p);
15780 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015781 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015782 Py_INCREF(*p);
15783 }
Walter Dörwald16807132007-05-25 13:52:07 +000015784}
15785
15786PyObject *
15787PyUnicode_InternFromString(const char *cp)
15788{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015789 PyObject *s = PyUnicode_FromString(cp);
15790 if (s == NULL)
15791 return NULL;
15792 PyUnicode_InternInPlace(&s);
15793 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015794}
15795
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015796
Victor Stinner666ecfb2020-07-02 01:19:57 +020015797void
15798_PyUnicode_ClearInterned(PyThreadState *tstate)
Walter Dörwald16807132007-05-25 13:52:07 +000015799{
Victor Stinner666ecfb2020-07-02 01:19:57 +020015800 if (!_Py_IsMainInterpreter(tstate)) {
15801 // interned dict is shared by all interpreters
Benjamin Peterson14339b62009-01-31 16:36:08 +000015802 return;
15803 }
Walter Dörwald16807132007-05-25 13:52:07 +000015804
Victor Stinner666ecfb2020-07-02 01:19:57 +020015805 if (interned == NULL) {
15806 return;
15807 }
15808 assert(PyDict_CheckExact(interned));
15809
15810 PyObject *keys = PyDict_Keys(interned);
15811 if (keys == NULL) {
15812 PyErr_Clear();
15813 return;
15814 }
15815 assert(PyList_CheckExact(keys));
15816
15817 /* Interned unicode strings are not forcibly deallocated; rather, we give
15818 them their stolen references back, and then clear and DECREF the
15819 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015820
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015821 Py_ssize_t n = PyList_GET_SIZE(keys);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015822#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015823 fprintf(stderr, "releasing %zd interned strings\n", n);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015824
15825 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015826#endif
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015827 for (Py_ssize_t i = 0; i < n; i++) {
15828 PyObject *s = PyList_GET_ITEM(keys, i);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015829 assert(PyUnicode_IS_READY(s));
15830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015831 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015832 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015833 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015834#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015835 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015836#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015837 break;
15838 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015839 // Restore the two references (key and value) ignored
15840 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015841 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015842#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015843 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015844#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015845 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015846 case SSTATE_NOT_INTERNED:
15847 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015848 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015849 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015851 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015852 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015853#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015854 fprintf(stderr,
15855 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15856 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015857#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015858 Py_DECREF(keys);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015859
Benjamin Peterson14339b62009-01-31 16:36:08 +000015860 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015861 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015862}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015863
15864
15865/********************* Unicode Iterator **************************/
15866
15867typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015868 PyObject_HEAD
15869 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015870 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015871} unicodeiterobject;
15872
15873static void
15874unicodeiter_dealloc(unicodeiterobject *it)
15875{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015876 _PyObject_GC_UNTRACK(it);
15877 Py_XDECREF(it->it_seq);
15878 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015879}
15880
15881static int
15882unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15883{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015884 Py_VISIT(it->it_seq);
15885 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015886}
15887
15888static PyObject *
15889unicodeiter_next(unicodeiterobject *it)
15890{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015891 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015892
Benjamin Peterson14339b62009-01-31 16:36:08 +000015893 assert(it != NULL);
15894 seq = it->it_seq;
15895 if (seq == NULL)
15896 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015897 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015899 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15900 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015901 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015902 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15903 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015904 if (item != NULL)
15905 ++it->it_index;
15906 return item;
15907 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015908
Benjamin Peterson14339b62009-01-31 16:36:08 +000015909 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015910 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015911 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015912}
15913
15914static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015915unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015916{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015917 Py_ssize_t len = 0;
15918 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015919 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015920 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015921}
15922
15923PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15924
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015925static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015926unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015927{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015928 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015929 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015930 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015931 it->it_seq, it->it_index);
15932 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015933 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015934 if (u == NULL)
15935 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015936 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015937 }
15938}
15939
15940PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15941
15942static PyObject *
15943unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15944{
15945 Py_ssize_t index = PyLong_AsSsize_t(state);
15946 if (index == -1 && PyErr_Occurred())
15947 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015948 if (it->it_seq != NULL) {
15949 if (index < 0)
15950 index = 0;
15951 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15952 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15953 it->it_index = index;
15954 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015955 Py_RETURN_NONE;
15956}
15957
15958PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15959
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015960static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015961 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015962 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015963 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15964 reduce_doc},
15965 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15966 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015967 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015968};
15969
15970PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015971 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15972 "str_iterator", /* tp_name */
15973 sizeof(unicodeiterobject), /* tp_basicsize */
15974 0, /* tp_itemsize */
15975 /* methods */
15976 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015977 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015978 0, /* tp_getattr */
15979 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015980 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015981 0, /* tp_repr */
15982 0, /* tp_as_number */
15983 0, /* tp_as_sequence */
15984 0, /* tp_as_mapping */
15985 0, /* tp_hash */
15986 0, /* tp_call */
15987 0, /* tp_str */
15988 PyObject_GenericGetAttr, /* tp_getattro */
15989 0, /* tp_setattro */
15990 0, /* tp_as_buffer */
15991 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15992 0, /* tp_doc */
15993 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15994 0, /* tp_clear */
15995 0, /* tp_richcompare */
15996 0, /* tp_weaklistoffset */
15997 PyObject_SelfIter, /* tp_iter */
15998 (iternextfunc)unicodeiter_next, /* tp_iternext */
15999 unicodeiter_methods, /* tp_methods */
16000 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016001};
16002
16003static PyObject *
16004unicode_iter(PyObject *seq)
16005{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016006 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016007
Benjamin Peterson14339b62009-01-31 16:36:08 +000016008 if (!PyUnicode_Check(seq)) {
16009 PyErr_BadInternalCall();
16010 return NULL;
16011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020016012 if (PyUnicode_READY(seq) == -1)
16013 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016014 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16015 if (it == NULL)
16016 return NULL;
16017 it->it_index = 0;
16018 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016019 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016020 _PyObject_GC_TRACK(it);
16021 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016022}
16023
Victor Stinner709d23d2019-05-02 14:56:30 -040016024static int
16025encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016026{
Victor Stinner709d23d2019-05-02 14:56:30 -040016027 int res;
16028 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16029 if (res == -2) {
16030 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16031 return -1;
16032 }
16033 if (res < 0) {
16034 PyErr_NoMemory();
16035 return -1;
16036 }
16037 return 0;
16038}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016039
Victor Stinner709d23d2019-05-02 14:56:30 -040016040
16041static int
16042config_get_codec_name(wchar_t **config_encoding)
16043{
16044 char *encoding;
16045 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16046 return -1;
16047 }
16048
16049 PyObject *name_obj = NULL;
16050 PyObject *codec = _PyCodec_Lookup(encoding);
16051 PyMem_RawFree(encoding);
16052
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016053 if (!codec)
16054 goto error;
16055
16056 name_obj = PyObject_GetAttrString(codec, "name");
16057 Py_CLEAR(codec);
16058 if (!name_obj) {
16059 goto error;
16060 }
16061
Victor Stinner709d23d2019-05-02 14:56:30 -040016062 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16063 Py_DECREF(name_obj);
16064 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016065 goto error;
16066 }
16067
Victor Stinner709d23d2019-05-02 14:56:30 -040016068 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16069 if (raw_wname == NULL) {
16070 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016071 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016072 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016073 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016074
16075 PyMem_RawFree(*config_encoding);
16076 *config_encoding = raw_wname;
16077
16078 PyMem_Free(wname);
16079 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016080
16081error:
16082 Py_XDECREF(codec);
16083 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016084 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016085}
16086
16087
Victor Stinner331a6a52019-05-27 16:39:22 +020016088static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016089init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016090{
Victor Stinner709d23d2019-05-02 14:56:30 -040016091 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016092 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016093 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016094 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016095 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016096 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016097 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016098}
16099
16100
Victor Stinner709d23d2019-05-02 14:56:30 -040016101static int
16102init_fs_codec(PyInterpreterState *interp)
16103{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016104 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016105
16106 _Py_error_handler error_handler;
16107 error_handler = get_error_handler_wide(config->filesystem_errors);
16108 if (error_handler == _Py_ERROR_UNKNOWN) {
16109 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16110 return -1;
16111 }
16112
16113 char *encoding, *errors;
16114 if (encode_wstr_utf8(config->filesystem_encoding,
16115 &encoding,
16116 "filesystem_encoding") < 0) {
16117 return -1;
16118 }
16119
16120 if (encode_wstr_utf8(config->filesystem_errors,
16121 &errors,
16122 "filesystem_errors") < 0) {
16123 PyMem_RawFree(encoding);
16124 return -1;
16125 }
16126
Victor Stinner3d17c042020-05-14 01:48:38 +020016127 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16128 PyMem_RawFree(fs_codec->encoding);
16129 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016130 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016131 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16132 PyMem_RawFree(fs_codec->errors);
16133 fs_codec->errors = errors;
16134 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016135
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016136#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016137 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016138#endif
16139
Victor Stinner709d23d2019-05-02 14:56:30 -040016140 /* At this point, PyUnicode_EncodeFSDefault() and
16141 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16142 the C implementation of the filesystem encoding. */
16143
16144 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16145 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016146 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16147 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016148 PyErr_NoMemory();
16149 return -1;
16150 }
16151 return 0;
16152}
16153
16154
Victor Stinner331a6a52019-05-27 16:39:22 +020016155static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016156init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016157{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016158 PyInterpreterState *interp = tstate->interp;
16159
Victor Stinner709d23d2019-05-02 14:56:30 -040016160 /* Update the filesystem encoding to the normalized Python codec name.
16161 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16162 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016163 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016164 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016165 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016166 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016167 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016168 }
16169
Victor Stinner709d23d2019-05-02 14:56:30 -040016170 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016171 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016172 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016173 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016174}
16175
16176
Victor Stinner331a6a52019-05-27 16:39:22 +020016177PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016178_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016179{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016180 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016181 if (_PyStatus_EXCEPTION(status)) {
16182 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016183 }
16184
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016185 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016186}
16187
16188
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016189static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016190_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016191{
Victor Stinner3d17c042020-05-14 01:48:38 +020016192 PyMem_RawFree(fs_codec->encoding);
16193 fs_codec->encoding = NULL;
16194 fs_codec->utf8 = 0;
16195 PyMem_RawFree(fs_codec->errors);
16196 fs_codec->errors = NULL;
16197 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016198}
16199
16200
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Return the result of init_fs_codec() on success of the allocations,
   or -1 with MemoryError set if the new strings cannot be allocated (the
   configuration is left unchanged in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    /* The configuration is modified in place, hence the cast. */
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings before touching the config. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16226
16227
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016228void
Victor Stinner3d483342019-11-22 12:27:50 +010016229_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016230{
Victor Stinner666ecfb2020-07-02 01:19:57 +020016231 // _PyUnicode_ClearInterned() must be called before
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016232
Victor Stinner666ecfb2020-07-02 01:19:57 +020016233 struct _Py_unicode_state *state = &tstate->interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016234
Victor Stinner91698d82020-06-25 14:07:40 +020016235 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016236
Victor Stinner2f9ada92020-06-24 02:22:21 +020016237 for (Py_ssize_t i = 0; i < 256; i++) {
16238 Py_CLEAR(state->latin1[i]);
16239 }
16240
Victor Stinner666ecfb2020-07-02 01:19:57 +020016241 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016242 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016243 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016244
Victor Stinner3d17c042020-05-14 01:48:38 +020016245 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016246}
16247
16248
Georg Brandl66c221e2010-10-14 07:04:07 +000016249/* A _string module, to export formatter_parser and formatter_field_name_split
16250 to the string.Formatter class implemented in Python. */
16251
16252static PyMethodDef _string_methods[] = {
16253 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16254 METH_O, PyDoc_STR("split the argument as a field name")},
16255 {"formatter_parser", (PyCFunction) formatter_parser,
16256 METH_O, PyDoc_STR("parse the argument as a format string")},
16257 {NULL, NULL}
16258};
16259
16260static struct PyModuleDef _string_module = {
16261 PyModuleDef_HEAD_INIT,
Victor Stinnerbb083d32020-09-08 15:33:08 +020016262 .m_name = "_string",
16263 .m_doc = PyDoc_STR("string helper module"),
16264 .m_size = 0,
16265 .m_methods = _string_methods,
Georg Brandl66c221e2010-10-14 07:04:07 +000016266};
16267
16268PyMODINIT_FUNC
16269PyInit__string(void)
16270{
Victor Stinnerbb083d32020-09-08 15:33:08 +020016271 return PyModuleDef_Init(&_string_module);
Georg Brandl66c221e2010-10-14 07:04:07 +000016272}
16273
16274
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016275#ifdef __cplusplus
16276}
16277#endif