blob: 3238d1e692a5175b12acf2707de085d69e9bdae2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinnerba3d67c2020-12-26 00:41:46 +010044#include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
Victor Stinner47e1afd2020-10-26 16:43:47 +010045#include "pycore_bytes_methods.h" // _Py_bytes_lower()
Serhiy Storchaka2ad93822020-12-03 12:46:16 +020046#include "pycore_format.h" // F_LJUST
Victor Stinner47e1afd2020-10-26 16:43:47 +010047#include "pycore_initconfig.h" // _PyStatus_OK()
48#include "pycore_interp.h" // PyInterpreterState.fs_codec
49#include "pycore_object.h" // _PyObject_GC_TRACK()
50#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
51#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
52#include "pycore_pystate.h" // _PyInterpreterState_GET()
53#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
54#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000056#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000057#include <windows.h>
58#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059
Victor Stinner666ecfb2020-07-02 01:19:57 +020060/* Uncomment to display statistics on interned strings at exit
61 in _PyUnicode_ClearInterned(). */
Victor Stinnerfecc4f22019-03-19 14:20:29 +010062/* #define INTERNED_STATS 1 */
63
64
Larry Hastings61272b72014-01-07 12:41:53 -080065/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090066class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080067[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090068/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
69
70/*[python input]
71class Py_UCS4_converter(CConverter):
72 type = 'Py_UCS4'
73 converter = 'convert_uc'
74
75 def converter_init(self):
76 if self.default is not unspecified:
77 self.c_default = ascii(self.default)
78 if len(self.c_default) > 4 or self.c_default[0] != "'":
79 self.c_default = hex(ord(self.default))
80
81[python start generated code]*/
82/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080083
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
Serhiy Storchaka05997252013-01-26 12:14:02 +020086NOTE: In the interpreter's initialization phase, some globals are currently
87 initialized dynamically as needed. In the process Unicode objects may
88 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000089
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Victor Stinner8faf8212011-12-08 22:14:11 +010097/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
98#define MAX_UNICODE 0x10ffff
99
Victor Stinner910337b2011-10-03 03:20:16 +0200100#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200101# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +0200102#else
103# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
104#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200105
/* --- Accessors for the fields of a Unicode object -------------------------

   The macros below cast `op` to the relevant struct type and read one member.
   Variants which contain an assert() verify `op` with _PyUnicode_CHECK()
   first.  Every macro parenthesizes its argument, so any pointer expression
   can be passed. */

/* The utf8 member of a PyCompactUnicodeObject (no check). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 representation of a ready string: for a compact ASCII string this is
   the memory located right after the PyASCIIObject header, otherwise the
   utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* The utf8_length member of a PyCompactUnicodeObject (no check). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 representation of a ready string: the string length
   for a compact ASCII string, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* The wstr member of a PyASCIIObject (no check). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* Length of the wstr representation: the string length for a compact ASCII
   string, otherwise the wstr_length member.  `op` is parenthesized inside the
   casts so that an argument such as "cond ? a : b" is cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         ((PyCompactUnicodeObject*)(op))->wstr_length)
/* The wstr_length member of a PyCompactUnicodeObject (no check). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* The length member of a PyASCIIObject (no check). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* The state member of a PyASCIIObject (no check). */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* The hash member of a PyASCIIObject (no check). */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind of a checked Unicode object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* The length member of a checked Unicode object. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* The data.any member of a PyUnicodeObject (no check). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200147
/* Redefine PyUnicode_READY() for this file: assert _PyUnicode_CHECK(op), then
   evaluate to 0 if the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200154
/* True if the utf8 pointer of `op` is the character data itself.  Must not be
   used on a compact ASCII string (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the character data itself.  The wstr
   pointer is read from the macro argument `op`, not from a variable which
   happens to be called "unicode" at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
162
/* True if the Unicode object owns a UTF-8 memory block of its own: it is not
   a compact ASCII string, its utf8 pointer is set, and that pointer is not
   the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a wstr memory block of its own: its wstr
   pointer is set and either the string is not ready, or the pointer is not
   the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
176
/* Generic helper macro copying characters from one character type to another.

   from_type and to_type must be valid type names.  begin and end delimit the
   source characters and are converted to "const from_type *".  to is
   converted to "to_type *" and points to the buffer receiving the result.
   Each character is converted with a plain cast to to_type.  Most of the
   input is handled four characters per iteration; the remaining (at most
   three) characters are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_dst = (to_type *)(to);                                \
        const from_type *_src = (const from_type *)(begin);             \
        const from_type *_stop = (const from_type *)(end);              \
        Py_ssize_t _count = _stop - _src;                               \
        const from_type *_block_stop =                                  \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _src < _block_stop; _src += 4, _dst += 4) {              \
            _dst[0] = (to_type) _src[0];                                \
            _dst[1] = (to_type) _src[1];                                \
            _dst[2] = (to_type) _src[2];                                \
            _dst[3] = (to_type) _src[3];                                \
        }                                                               \
        for (; _src < _stop; _src++, _dst++) {                          \
            *_dst = (to_type) *_src;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200200
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
208
Victor Stinner607b1022020-05-05 18:50:30 +0200209/* bpo-40521: Interned strings are shared by all interpreters. */
210#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
211# define INTERNED_STRINGS
212#endif
213
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Victor Stinner607b1022020-05-05 18:50:30 +0200222#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200223static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200224#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000225
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200226static struct _Py_unicode_state*
227get_unicode_state(void)
228{
229 PyInterpreterState *interp = _PyInterpreterState_GET();
230 return &interp->unicode;
231}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200232
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000233
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200234// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200235static inline PyObject* unicode_get_empty(void)
236{
237 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200238 // unicode_get_empty() must not be called before _PyUnicode_Init()
239 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200240 assert(state->empty_string != NULL);
241 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200242}
243
Victor Stinner91698d82020-06-25 14:07:40 +0200244
245// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200246static inline PyObject* unicode_new_empty(void)
247{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200248 PyObject *empty = unicode_get_empty();
249 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200250 return empty;
251}
252
253#define _Py_RETURN_UNICODE_EMPTY() \
254 do { \
255 return unicode_new_empty(); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200256 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257
Victor Stinner59423e32018-11-26 13:40:01 +0100258static inline void
259unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
260 Py_ssize_t start, Py_ssize_t length)
261{
262 assert(0 <= start);
263 assert(kind != PyUnicode_WCHAR_KIND);
264 switch (kind) {
265 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100266 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100267 Py_UCS1 ch = (unsigned char)value;
268 Py_UCS1 *to = (Py_UCS1 *)data + start;
269 memset(to, ch, length);
270 break;
271 }
272 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100273 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100274 Py_UCS2 ch = (Py_UCS2)value;
275 Py_UCS2 *to = (Py_UCS2 *)data + start;
276 const Py_UCS2 *end = to + length;
277 for (; to < end; ++to) *to = ch;
278 break;
279 }
280 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100281 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100282 Py_UCS4 ch = value;
283 Py_UCS4 * to = (Py_UCS4 *)data + start;
284 const Py_UCS4 *end = to + length;
285 for (; to < end; ++to) *to = ch;
286 break;
287 }
288 default: Py_UNREACHABLE();
289 }
290}
291
292
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200293/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700294static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200295_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900296static inline void
297_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400298static PyObject *
299unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
300 const char *errors);
301static PyObject *
302unicode_decode_utf8(const char *s, Py_ssize_t size,
303 _Py_error_handler error_handler, const char *errors,
304 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200305
/* Fast detection of the most frequent whitespace characters.

   Table indexed by an ASCII code (0x00..0x7F), 16 entries per row: an entry
   is 1 for the characters listed in the row comments and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00..0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10..0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20..0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30..0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40..0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50..0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60..0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70..0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
336
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200337/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200338static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200339static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100340static int unicode_modifiable(PyObject *unicode);
341
Victor Stinnerfe226c02011-10-03 03:52:20 +0200342
Alexander Belopolsky40018472011-02-26 01:02:56 +0000343static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100344_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200345static PyObject *
346_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
347static PyObject *
348_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
349
350static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000351unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000352 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100353 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000354 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
355
Alexander Belopolsky40018472011-02-26 01:02:56 +0000356static void
357raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300358 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100359 PyObject *unicode,
360 Py_ssize_t startpos, Py_ssize_t endpos,
361 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000362
/* Fast detection of line break characters.

   Table indexed by an ASCII code (0x00..0x7F), 16 entries per row: an entry
   is 1 for the characters listed in the row comments and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00..0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10..0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20..0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30..0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40..0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50..0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60..0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70..0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
390
INADA Naoki3ae20562017-01-16 20:41:20 +0900391static int convert_uc(PyObject *obj, void *addr);
392
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300393#include "clinic/unicodeobject.c.h"
394
Victor Stinner3d4226a2018-08-29 22:21:32 +0200395_Py_error_handler
396_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200397{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200400 }
401 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200402 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200403 }
404 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200405 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200406 }
407 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200408 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200409 }
410 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200411 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200412 }
413 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200414 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200415 }
416 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200417 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200418 }
Victor Stinner50149202015-09-22 00:26:54 +0200419 return _Py_ERROR_OTHER;
420}
421
Victor Stinner709d23d2019-05-02 14:56:30 -0400422
423static _Py_error_handler
424get_error_handler_wide(const wchar_t *errors)
425{
426 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
427 return _Py_ERROR_STRICT;
428 }
429 if (wcscmp(errors, L"surrogateescape") == 0) {
430 return _Py_ERROR_SURROGATEESCAPE;
431 }
432 if (wcscmp(errors, L"replace") == 0) {
433 return _Py_ERROR_REPLACE;
434 }
435 if (wcscmp(errors, L"ignore") == 0) {
436 return _Py_ERROR_IGNORE;
437 }
438 if (wcscmp(errors, L"backslashreplace") == 0) {
439 return _Py_ERROR_BACKSLASHREPLACE;
440 }
441 if (wcscmp(errors, L"surrogatepass") == 0) {
442 return _Py_ERROR_SURROGATEPASS;
443 }
444 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
445 return _Py_ERROR_XMLCHARREFREPLACE;
446 }
447 return _Py_ERROR_OTHER;
448}
449
450
Victor Stinner22eb6892019-06-26 00:51:05 +0200451static inline int
452unicode_check_encoding_errors(const char *encoding, const char *errors)
453{
454 if (encoding == NULL && errors == NULL) {
455 return 0;
456 }
457
Victor Stinner81a7be32020-04-14 15:14:01 +0200458 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200459#ifndef Py_DEBUG
460 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200461 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200462 return 0;
463 }
464#else
465 /* Always check in debug mode */
466#endif
467
468 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
469 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200470 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200471 return 0;
472 }
473
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200474 /* Disable checks during Python finalization. For example, it allows to
475 call _PyObject_Dump() during finalization for debugging purpose. */
476 if (interp->finalizing) {
477 return 0;
478 }
479
Victor Stinner22eb6892019-06-26 00:51:05 +0200480 if (encoding != NULL) {
481 PyObject *handler = _PyCodec_Lookup(encoding);
482 if (handler == NULL) {
483 return -1;
484 }
485 Py_DECREF(handler);
486 }
487
488 if (errors != NULL) {
489 PyObject *handler = PyCodec_LookupError(errors);
490 if (handler == NULL) {
491 return -1;
492 }
493 Py_DECREF(handler);
494 }
495 return 0;
496}
497
498
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200499int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100500_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200501{
Victor Stinner68762572019-10-07 18:42:01 +0200502#define CHECK(expr) \
503 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
504
Victor Stinner910337b2011-10-03 03:20:16 +0200505 PyASCIIObject *ascii;
506 unsigned int kind;
507
Victor Stinner68762572019-10-07 18:42:01 +0200508 assert(op != NULL);
509 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200510
511 ascii = (PyASCIIObject *)op;
512 kind = ascii->state.kind;
513
Victor Stinnera3b334d2011-10-03 13:53:37 +0200514 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200515 CHECK(kind == PyUnicode_1BYTE_KIND);
516 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200517 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200518 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200519 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200520 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200521
Victor Stinnera41463c2011-10-04 01:05:08 +0200522 if (ascii->state.compact == 1) {
523 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200524 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200525 || kind == PyUnicode_2BYTE_KIND
526 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200527 CHECK(ascii->state.ascii == 0);
528 CHECK(ascii->state.ready == 1);
529 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100530 }
531 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200532 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
533
534 data = unicode->data.any;
535 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200536 CHECK(ascii->length == 0);
537 CHECK(ascii->hash == -1);
538 CHECK(ascii->state.compact == 0);
539 CHECK(ascii->state.ascii == 0);
540 CHECK(ascii->state.ready == 0);
541 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
542 CHECK(ascii->wstr != NULL);
543 CHECK(data == NULL);
544 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200545 }
546 else {
Victor Stinner68762572019-10-07 18:42:01 +0200547 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200548 || kind == PyUnicode_2BYTE_KIND
549 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200550 CHECK(ascii->state.compact == 0);
551 CHECK(ascii->state.ready == 1);
552 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200553 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200554 CHECK(compact->utf8 == data);
555 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200556 }
557 else
Victor Stinner68762572019-10-07 18:42:01 +0200558 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200559 }
560 }
561 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200562 if (
563#if SIZEOF_WCHAR_T == 2
564 kind == PyUnicode_2BYTE_KIND
565#else
566 kind == PyUnicode_4BYTE_KIND
567#endif
568 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200569 {
Victor Stinner68762572019-10-07 18:42:01 +0200570 CHECK(ascii->wstr == data);
571 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200572 } else
Victor Stinner68762572019-10-07 18:42:01 +0200573 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200574 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200575
576 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200577 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200578 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200579 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200580 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200581
582 /* check that the best kind is used: O(n) operation */
583 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200584 Py_ssize_t i;
585 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300586 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200587 Py_UCS4 ch;
588
589 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200590 for (i=0; i < ascii->length; i++)
591 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200592 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200593 if (ch > maxchar)
594 maxchar = ch;
595 }
596 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100597 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200598 CHECK(maxchar >= 128);
599 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100600 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200601 else
Victor Stinner68762572019-10-07 18:42:01 +0200602 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200603 }
Victor Stinner77faf692011-11-20 18:56:05 +0100604 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200605 CHECK(maxchar >= 0x100);
606 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100607 }
608 else {
Victor Stinner68762572019-10-07 18:42:01 +0200609 CHECK(maxchar >= 0x10000);
610 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100611 }
Victor Stinner68762572019-10-07 18:42:01 +0200612 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200613 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400614 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200615
616#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400617}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200618
Victor Stinner910337b2011-10-03 03:20:16 +0200619
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100620static PyObject*
621unicode_result_wchar(PyObject *unicode)
622{
623#ifndef Py_DEBUG
624 Py_ssize_t len;
625
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100626 len = _PyUnicode_WSTR_LENGTH(unicode);
627 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100628 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200629 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630 }
631
632 if (len == 1) {
633 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100634 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100635 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200636 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100637 }
638 }
639
640 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200641 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100642 return NULL;
643 }
644#else
Victor Stinneraa771272012-10-04 02:32:58 +0200645 assert(Py_REFCNT(unicode) == 1);
646
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100647 /* don't make the result ready in debug mode to ensure that the caller
648 makes the string ready before using it */
649 assert(_PyUnicode_CheckConsistency(unicode, 1));
650#endif
651 return unicode;
652}
653
654static PyObject*
655unicode_result_ready(PyObject *unicode)
656{
657 Py_ssize_t length;
658
659 length = PyUnicode_GET_LENGTH(unicode);
660 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200661 PyObject *empty = unicode_get_empty();
662 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100663 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200664 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100665 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200666 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100667 }
668
669 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200670 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200671 if (kind == PyUnicode_1BYTE_KIND) {
672 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
673 Py_UCS1 ch = data[0];
674 struct _Py_unicode_state *state = get_unicode_state();
675 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100676 if (latin1_char != NULL) {
677 if (unicode != latin1_char) {
678 Py_INCREF(latin1_char);
679 Py_DECREF(unicode);
680 }
681 return latin1_char;
682 }
683 else {
684 assert(_PyUnicode_CheckConsistency(unicode, 1));
685 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200686 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100687 return unicode;
688 }
689 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200690 else {
691 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
692 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100693 }
694
695 assert(_PyUnicode_CheckConsistency(unicode, 1));
696 return unicode;
697}
698
699static PyObject*
700unicode_result(PyObject *unicode)
701{
702 assert(_PyUnicode_CHECK(unicode));
703 if (PyUnicode_IS_READY(unicode))
704 return unicode_result_ready(unicode);
705 else
706 return unicode_result_wchar(unicode);
707}
708
Victor Stinnerc4b49542011-12-11 22:44:26 +0100709static PyObject*
710unicode_result_unchanged(PyObject *unicode)
711{
712 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500713 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100714 return NULL;
715 Py_INCREF(unicode);
716 return unicode;
717 }
718 else
719 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100720 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100721}
722
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200723/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
724 ASCII, Latin1, UTF-8, etc. */
725static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200726backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200727 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
728{
Victor Stinnerad771582015-10-09 12:38:53 +0200729 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200730 Py_UCS4 ch;
731 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300732 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200733
734 assert(PyUnicode_IS_READY(unicode));
735 kind = PyUnicode_KIND(unicode);
736 data = PyUnicode_DATA(unicode);
737
738 size = 0;
739 /* determine replacement size */
740 for (i = collstart; i < collend; ++i) {
741 Py_ssize_t incr;
742
743 ch = PyUnicode_READ(kind, data, i);
744 if (ch < 0x100)
745 incr = 2+2;
746 else if (ch < 0x10000)
747 incr = 2+4;
748 else {
749 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200750 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200751 }
752 if (size > PY_SSIZE_T_MAX - incr) {
753 PyErr_SetString(PyExc_OverflowError,
754 "encoded result is too long for a Python string");
755 return NULL;
756 }
757 size += incr;
758 }
759
Victor Stinnerad771582015-10-09 12:38:53 +0200760 str = _PyBytesWriter_Prepare(writer, str, size);
761 if (str == NULL)
762 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200763
764 /* generate replacement */
765 for (i = collstart; i < collend; ++i) {
766 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200767 *str++ = '\\';
768 if (ch >= 0x00010000) {
769 *str++ = 'U';
770 *str++ = Py_hexdigits[(ch>>28)&0xf];
771 *str++ = Py_hexdigits[(ch>>24)&0xf];
772 *str++ = Py_hexdigits[(ch>>20)&0xf];
773 *str++ = Py_hexdigits[(ch>>16)&0xf];
774 *str++ = Py_hexdigits[(ch>>12)&0xf];
775 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200776 }
Victor Stinner797485e2015-10-09 03:17:30 +0200777 else if (ch >= 0x100) {
778 *str++ = 'u';
779 *str++ = Py_hexdigits[(ch>>12)&0xf];
780 *str++ = Py_hexdigits[(ch>>8)&0xf];
781 }
782 else
783 *str++ = 'x';
784 *str++ = Py_hexdigits[(ch>>4)&0xf];
785 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200786 }
787 return str;
788}
789
790/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
791 ASCII, Latin1, UTF-8, etc. */
792static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200793xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200794 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
795{
Victor Stinnerad771582015-10-09 12:38:53 +0200796 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200797 Py_UCS4 ch;
798 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300799 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200800
801 assert(PyUnicode_IS_READY(unicode));
802 kind = PyUnicode_KIND(unicode);
803 data = PyUnicode_DATA(unicode);
804
805 size = 0;
806 /* determine replacement size */
807 for (i = collstart; i < collend; ++i) {
808 Py_ssize_t incr;
809
810 ch = PyUnicode_READ(kind, data, i);
811 if (ch < 10)
812 incr = 2+1+1;
813 else if (ch < 100)
814 incr = 2+2+1;
815 else if (ch < 1000)
816 incr = 2+3+1;
817 else if (ch < 10000)
818 incr = 2+4+1;
819 else if (ch < 100000)
820 incr = 2+5+1;
821 else if (ch < 1000000)
822 incr = 2+6+1;
823 else {
824 assert(ch <= MAX_UNICODE);
825 incr = 2+7+1;
826 }
827 if (size > PY_SSIZE_T_MAX - incr) {
828 PyErr_SetString(PyExc_OverflowError,
829 "encoded result is too long for a Python string");
830 return NULL;
831 }
832 size += incr;
833 }
834
Victor Stinnerad771582015-10-09 12:38:53 +0200835 str = _PyBytesWriter_Prepare(writer, str, size);
836 if (str == NULL)
837 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200838
839 /* generate replacement */
840 for (i = collstart; i < collend; ++i) {
Christian Heimes07f2ade2020-11-18 16:38:53 +0100841 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
842 if (size < 0) {
843 return NULL;
844 }
845 str += size;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200846 }
847 return str;
848}
849
Thomas Wouters477c8d52006-05-27 19:21:47 +0000850/* --- Bloom Filters ----------------------------------------------------- */
851
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (at least 5) of each Unicode character as the
   bit index. */
855
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200856/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857
Antoine Pitrouf068f942010-01-13 14:19:12 +0000858#if LONG_BIT >= 128
859#define BLOOM_WIDTH 128
860#elif LONG_BIT >= 64
861#define BLOOM_WIDTH 64
862#elif LONG_BIT >= 32
863#define BLOOM_WIDTH 32
864#else
865#error "LONG_BIT is smaller than 32"
866#endif
867
Thomas Wouters477c8d52006-05-27 19:21:47 +0000868#define BLOOM_MASK unsigned long
869
Serhiy Storchaka05997252013-01-26 12:14:02 +0200870static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000871
Antoine Pitrouf068f942010-01-13 14:19:12 +0000872#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873
Benjamin Peterson29060642009-01-31 22:14:21 +0000874#define BLOOM_LINEBREAK(ch) \
875 ((ch) < 128U ? ascii_linebreak[(ch)] : \
876 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700878static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300879make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000880{
Victor Stinnera85af502013-04-09 21:53:54 +0200881#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
882 do { \
883 TYPE *data = (TYPE *)PTR; \
884 TYPE *end = data + LEN; \
885 Py_UCS4 ch; \
886 for (; data != end; data++) { \
887 ch = *data; \
888 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
889 } \
890 break; \
891 } while (0)
892
Thomas Wouters477c8d52006-05-27 19:21:47 +0000893 /* calculate simple bloom-style bitmask for a given unicode string */
894
Antoine Pitrouf068f942010-01-13 14:19:12 +0000895 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000896
897 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200898 switch (kind) {
899 case PyUnicode_1BYTE_KIND:
900 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
901 break;
902 case PyUnicode_2BYTE_KIND:
903 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
904 break;
905 case PyUnicode_4BYTE_KIND:
906 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
907 break;
908 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700909 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200910 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000911 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200912
913#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000914}
915
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300916static int
917ensure_unicode(PyObject *obj)
918{
919 if (!PyUnicode_Check(obj)) {
920 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200921 "must be str, not %.100s",
922 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300923 return -1;
924 }
925 return PyUnicode_READY(obj);
926}
927
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200928/* Compilation of templated routines */
929
Victor Stinner90ed8a62020-06-24 00:34:07 +0200930#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200931
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200932#include "stringlib/asciilib.h"
933#include "stringlib/fastsearch.h"
934#include "stringlib/partition.h"
935#include "stringlib/split.h"
936#include "stringlib/count.h"
937#include "stringlib/find.h"
938#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200939#include "stringlib/undef.h"
940
941#include "stringlib/ucs1lib.h"
942#include "stringlib/fastsearch.h"
943#include "stringlib/partition.h"
944#include "stringlib/split.h"
945#include "stringlib/count.h"
946#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300947#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200948#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200949#include "stringlib/undef.h"
950
951#include "stringlib/ucs2lib.h"
952#include "stringlib/fastsearch.h"
953#include "stringlib/partition.h"
954#include "stringlib/split.h"
955#include "stringlib/count.h"
956#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300957#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200958#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200959#include "stringlib/undef.h"
960
961#include "stringlib/ucs4lib.h"
962#include "stringlib/fastsearch.h"
963#include "stringlib/partition.h"
964#include "stringlib/split.h"
965#include "stringlib/count.h"
966#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300967#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200968#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200969#include "stringlib/undef.h"
970
Inada Naoki2c4928d2020-06-17 20:09:44 +0900971_Py_COMP_DIAG_PUSH
972_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200973#include "stringlib/unicodedefs.h"
974#include "stringlib/fastsearch.h"
975#include "stringlib/count.h"
976#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100977#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900978_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200979
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200980#undef STRINGLIB_GET_EMPTY
981
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982/* --- Unicode Object ----------------------------------------------------- */
983
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700984static inline Py_ssize_t
985findchar(const void *s, int kind,
986 Py_ssize_t size, Py_UCS4 ch,
987 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200989 switch (kind) {
990 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200991 if ((Py_UCS1) ch != ch)
992 return -1;
993 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600994 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600996 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200997 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200998 if ((Py_UCS2) ch != ch)
999 return -1;
1000 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001001 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001004 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001005 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001006 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001007 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001008 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001009 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001010 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001012}
1013
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize (those from
   index `old_length` up to the current length) with 0xff bytes, so that use
   of uninitialized characters is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects such invalid characters, at
   least for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and
   U+FFFFFFFF is an invalid character in Unicode 6.0.

   Does nothing if the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int unit = PyUnicode_KIND(unicode);      /* bytes per character */
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * unit, 0xff,
               (new_length - old_length) * unit);
    }
}
#endif
1032
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033static PyObject*
1034resize_compact(PyObject *unicode, Py_ssize_t length)
1035{
1036 Py_ssize_t char_size;
1037 Py_ssize_t struct_size;
1038 Py_ssize_t new_size;
1039 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001040 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001041#ifdef Py_DEBUG
1042 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1043#endif
1044
Victor Stinner79891572012-05-03 13:43:07 +02001045 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001047 assert(PyUnicode_IS_COMPACT(unicode));
1048
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001049 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001050 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001051 struct_size = sizeof(PyASCIIObject);
1052 else
1053 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001054 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001055
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1057 PyErr_NoMemory();
1058 return NULL;
1059 }
1060 new_size = (struct_size + (length + 1) * char_size);
1061
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001062 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001063 PyObject_Free(_PyUnicode_UTF8(unicode));
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001064 _PyUnicode_UTF8(unicode) = NULL;
1065 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1066 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001067#ifdef Py_REF_DEBUG
1068 _Py_RefTotal--;
1069#endif
1070#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001071 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001072#endif
Victor Stinner84def372011-12-11 20:04:56 +01001073
Victor Stinner32bd68c2020-12-01 10:37:39 +01001074 new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001075 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001076 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001077 PyErr_NoMemory();
1078 return NULL;
1079 }
Victor Stinner84def372011-12-11 20:04:56 +01001080 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001081 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001082
Victor Stinnerfe226c02011-10-03 03:52:20 +02001083 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001085 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001086 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001087 _PyUnicode_WSTR_LENGTH(unicode) = length;
1088 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001089 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001090 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001092 if (!PyUnicode_IS_ASCII(unicode))
1093 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001094 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001095#ifdef Py_DEBUG
1096 unicode_fill_invalid(unicode, old_length);
1097#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1099 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001100 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001101 return unicode;
1102}
1103
Alexander Belopolsky40018472011-02-26 01:02:56 +00001104static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001105resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106{
Victor Stinner95663112011-10-04 01:03:50 +02001107 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001108 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001110 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001111
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 if (PyUnicode_IS_READY(unicode)) {
1113 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001114 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001115 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001116#ifdef Py_DEBUG
1117 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1118#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001119
1120 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001121 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001122 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1123 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001124
1125 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1126 PyErr_NoMemory();
1127 return -1;
1128 }
1129 new_size = (length + 1) * char_size;
1130
Victor Stinner7a9105a2011-12-12 00:13:42 +01001131 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1132 {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001133 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinner7a9105a2011-12-12 00:13:42 +01001134 _PyUnicode_UTF8(unicode) = NULL;
1135 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1136 }
1137
Victor Stinner32bd68c2020-12-01 10:37:39 +01001138 data = (PyObject *)PyObject_Realloc(data, new_size);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001139 if (data == NULL) {
1140 PyErr_NoMemory();
1141 return -1;
1142 }
1143 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001144 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001145 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001146 _PyUnicode_WSTR_LENGTH(unicode) = length;
1147 }
1148 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001149 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001150 _PyUnicode_UTF8_LENGTH(unicode) = length;
1151 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001152 _PyUnicode_LENGTH(unicode) = length;
1153 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001154#ifdef Py_DEBUG
1155 unicode_fill_invalid(unicode, old_length);
1156#endif
Victor Stinner95663112011-10-04 01:03:50 +02001157 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001158 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001159 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001160 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001161 }
Victor Stinner95663112011-10-04 01:03:50 +02001162 assert(_PyUnicode_WSTR(unicode) != NULL);
1163
1164 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001165 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001166 PyErr_NoMemory();
1167 return -1;
1168 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001169 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001170 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001171 wstr = PyObject_Realloc(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001172 if (!wstr) {
1173 PyErr_NoMemory();
1174 return -1;
1175 }
1176 _PyUnicode_WSTR(unicode) = wstr;
1177 _PyUnicode_WSTR(unicode)[length] = 0;
1178 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001179 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 return 0;
1181}
1182
Victor Stinnerfe226c02011-10-03 03:52:20 +02001183static PyObject*
1184resize_copy(PyObject *unicode, Py_ssize_t length)
1185{
1186 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001187 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001189
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001190 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001191
1192 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1193 if (copy == NULL)
1194 return NULL;
1195
1196 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001197 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001198 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001199 }
1200 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001201 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001202
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001203 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001204 if (w == NULL)
1205 return NULL;
1206 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1207 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001208 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001209 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001210 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001211 }
1212}
1213
/* We allocate one extra character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1222
Alexander Belopolsky40018472011-02-26 01:02:56 +00001223static PyUnicodeObject *
1224_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001226 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228
Thomas Wouters477c8d52006-05-27 19:21:47 +00001229 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001230 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001231 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 }
1233
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001234 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001235 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001236 return (PyUnicodeObject *)PyErr_NoMemory();
1237 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 if (length < 0) {
1239 PyErr_SetString(PyExc_SystemError,
1240 "Negative size passed to _PyUnicode_New");
1241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 }
1243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1245 if (unicode == NULL)
1246 return NULL;
1247 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001248
1249 _PyUnicode_WSTR_LENGTH(unicode) = length;
1250 _PyUnicode_HASH(unicode) = -1;
1251 _PyUnicode_STATE(unicode).interned = 0;
1252 _PyUnicode_STATE(unicode).kind = 0;
1253 _PyUnicode_STATE(unicode).compact = 0;
1254 _PyUnicode_STATE(unicode).ready = 0;
1255 _PyUnicode_STATE(unicode).ascii = 0;
1256 _PyUnicode_DATA_ANY(unicode) = NULL;
1257 _PyUnicode_LENGTH(unicode) = 0;
1258 _PyUnicode_UTF8(unicode) = NULL;
1259 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1260
Victor Stinner32bd68c2020-12-01 10:37:39 +01001261 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001263 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001264 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001265 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267
Jeremy Hyltond8082792003-09-16 19:41:39 +00001268 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001269 * the caller fails before initializing str -- unicode_resize()
1270 * reads str[0], and the Keep-Alive optimization can keep memory
1271 * allocated for str alive across a call to unicode_dealloc(unicode).
1272 * We don't want unicode_resize to read uninitialized memory in
1273 * that case.
1274 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 _PyUnicode_WSTR(unicode)[0] = 0;
1276 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001277
Victor Stinner7931d9a2011-11-04 00:22:48 +01001278 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 return unicode;
1280}
1281
Victor Stinnerf42dc442011-10-02 23:33:16 +02001282static const char*
1283unicode_kind_name(PyObject *unicode)
1284{
Victor Stinner42dfd712011-10-03 14:41:45 +02001285 /* don't check consistency: unicode_kind_name() is called from
1286 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001287 if (!PyUnicode_IS_COMPACT(unicode))
1288 {
1289 if (!PyUnicode_IS_READY(unicode))
1290 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001291 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 {
1293 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001294 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001295 return "legacy ascii";
1296 else
1297 return "legacy latin1";
1298 case PyUnicode_2BYTE_KIND:
1299 return "legacy UCS2";
1300 case PyUnicode_4BYTE_KIND:
1301 return "legacy UCS4";
1302 default:
1303 return "<legacy invalid kind>";
1304 }
1305 }
1306 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001307 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001308 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001309 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001310 return "ascii";
1311 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001312 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001313 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001314 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001315 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001316 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001317 default:
1318 return "<invalid compact kind>";
1319 }
1320}
1321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001324const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001325 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001326 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327}
1328
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001329const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001330 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 return _PyUnicode_COMPACT_DATA(unicode);
1332}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001333const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001334 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001335 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1337 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1338 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1339 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1340 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1341 return PyUnicode_DATA(unicode);
1342}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001343
1344void
1345_PyUnicode_Dump(PyObject *op)
1346{
1347 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001348 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1349 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001350 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001351
Victor Stinnera849a4b2011-10-03 12:12:11 +02001352 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001353 {
1354 if (ascii->state.ascii)
1355 data = (ascii + 1);
1356 else
1357 data = (compact + 1);
1358 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001359 else
1360 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001361 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001362
Victor Stinnera849a4b2011-10-03 12:12:11 +02001363 if (ascii->wstr == data)
1364 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001365 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001366
Victor Stinnera3b334d2011-10-03 13:53:37 +02001367 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001368 printf(" (%zu), ", compact->wstr_length);
1369 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001370 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001371 }
1372 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001373 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001374 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001375}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376#endif
1377
Victor Stinner91698d82020-06-25 14:07:40 +02001378static int
1379unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1380{
1381 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1382 // optimized to always use state->empty_string without having to check if
1383 // it is NULL or not.
1384 PyObject *empty = PyUnicode_New(1, 0);
1385 if (empty == NULL) {
1386 return -1;
1387 }
1388 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1389 _PyUnicode_LENGTH(empty) = 0;
1390 assert(_PyUnicode_CheckConsistency(empty, 1));
1391
1392 assert(state->empty_string == NULL);
1393 state->empty_string = empty;
1394 return 0;
1395}
1396
1397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398PyObject *
1399PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1400{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001401 /* Optimization for empty strings */
1402 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001403 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001404 }
1405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 PyObject *obj;
1407 PyCompactUnicodeObject *unicode;
1408 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001409 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001410 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 Py_ssize_t char_size;
1412 Py_ssize_t struct_size;
1413
Victor Stinner9e9d6892011-10-04 01:02:02 +02001414 is_ascii = 0;
1415 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 struct_size = sizeof(PyCompactUnicodeObject);
1417 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001418 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 char_size = 1;
1420 is_ascii = 1;
1421 struct_size = sizeof(PyASCIIObject);
1422 }
1423 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001424 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 char_size = 1;
1426 }
1427 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001428 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 char_size = 2;
1430 if (sizeof(wchar_t) == 2)
1431 is_sharing = 1;
1432 }
1433 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001434 if (maxchar > MAX_UNICODE) {
1435 PyErr_SetString(PyExc_SystemError,
1436 "invalid maximum character passed to PyUnicode_New");
1437 return NULL;
1438 }
Victor Stinner8f825062012-04-27 13:55:39 +02001439 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 char_size = 4;
1441 if (sizeof(wchar_t) == 4)
1442 is_sharing = 1;
1443 }
1444
1445 /* Ensure we won't overflow the size. */
1446 if (size < 0) {
1447 PyErr_SetString(PyExc_SystemError,
1448 "Negative size passed to PyUnicode_New");
1449 return NULL;
1450 }
1451 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1452 return PyErr_NoMemory();
1453
1454 /* Duplicated allocation code from _PyObject_New() instead of a call to
1455 * PyObject_New() so we are able to allocate space for the object and
1456 * it's data buffer.
1457 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001458 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001459 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001461 }
1462 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
1464 unicode = (PyCompactUnicodeObject *)obj;
1465 if (is_ascii)
1466 data = ((PyASCIIObject*)obj) + 1;
1467 else
1468 data = unicode + 1;
1469 _PyUnicode_LENGTH(unicode) = size;
1470 _PyUnicode_HASH(unicode) = -1;
1471 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001472 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 _PyUnicode_STATE(unicode).compact = 1;
1474 _PyUnicode_STATE(unicode).ready = 1;
1475 _PyUnicode_STATE(unicode).ascii = is_ascii;
1476 if (is_ascii) {
1477 ((char*)data)[size] = 0;
1478 _PyUnicode_WSTR(unicode) = NULL;
1479 }
Victor Stinner8f825062012-04-27 13:55:39 +02001480 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 ((char*)data)[size] = 0;
1482 _PyUnicode_WSTR(unicode) = NULL;
1483 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001485 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 else {
1488 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001489 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001490 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001492 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 ((Py_UCS4*)data)[size] = 0;
1494 if (is_sharing) {
1495 _PyUnicode_WSTR_LENGTH(unicode) = size;
1496 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1497 }
1498 else {
1499 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500 _PyUnicode_WSTR(unicode) = NULL;
1501 }
1502 }
Victor Stinner8f825062012-04-27 13:55:39 +02001503#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001504 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001505#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001506 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 return obj;
1508}
1509
1510#if SIZEOF_WCHAR_T == 2
1511/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1512 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001513 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514
1515 This function assumes that unicode can hold one more code point than wstr
1516 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001517static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001519 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520{
1521 const wchar_t *iter;
1522 Py_UCS4 *ucs4_out;
1523
Victor Stinner910337b2011-10-03 03:20:16 +02001524 assert(unicode != NULL);
1525 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1527 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1528
1529 for (iter = begin; iter < end; ) {
1530 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1531 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001532 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1533 && (iter+1) < end
1534 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001535 {
Victor Stinner551ac952011-11-29 22:58:13 +01001536 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 iter += 2;
1538 }
1539 else {
1540 *ucs4_out++ = *iter;
1541 iter++;
1542 }
1543 }
1544 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1545 _PyUnicode_GET_LENGTH(unicode)));
1546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547}
1548#endif
1549
Victor Stinnercd9950f2011-10-02 00:34:53 +02001550static int
Victor Stinner488fa492011-12-12 00:01:39 +01001551unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001552{
Victor Stinner488fa492011-12-12 00:01:39 +01001553 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001554 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001555 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001556 return -1;
1557 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001558 return 0;
1559}
1560
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001561static int
1562_copy_characters(PyObject *to, Py_ssize_t to_start,
1563 PyObject *from, Py_ssize_t from_start,
1564 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001566 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001567 const void *from_data;
1568 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569
Victor Stinneree4544c2012-05-09 22:24:08 +02001570 assert(0 <= how_many);
1571 assert(0 <= from_start);
1572 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001573 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001574 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001575 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576
Victor Stinnerd3f08822012-05-29 12:57:52 +02001577 assert(PyUnicode_Check(to));
1578 assert(PyUnicode_IS_READY(to));
1579 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1580
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001581 if (how_many == 0)
1582 return 0;
1583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001585 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001587 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588
Victor Stinnerf1852262012-06-16 16:38:26 +02001589#ifdef Py_DEBUG
1590 if (!check_maxchar
1591 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1592 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001593 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001594 Py_UCS4 ch;
1595 Py_ssize_t i;
1596 for (i=0; i < how_many; i++) {
1597 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1598 assert(ch <= to_maxchar);
1599 }
1600 }
1601#endif
1602
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001603 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001604 if (check_maxchar
1605 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1606 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001607 /* Writing Latin-1 characters into an ASCII string requires to
1608 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001609 Py_UCS4 max_char;
1610 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001611 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001612 if (max_char >= 128)
1613 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001614 }
Christian Heimesf051e432016-09-13 20:22:02 +02001615 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001616 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001617 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001619 else if (from_kind == PyUnicode_1BYTE_KIND
1620 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001621 {
1622 _PyUnicode_CONVERT_BYTES(
1623 Py_UCS1, Py_UCS2,
1624 PyUnicode_1BYTE_DATA(from) + from_start,
1625 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1626 PyUnicode_2BYTE_DATA(to) + to_start
1627 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001628 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001629 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001630 && to_kind == PyUnicode_4BYTE_KIND)
1631 {
1632 _PyUnicode_CONVERT_BYTES(
1633 Py_UCS1, Py_UCS4,
1634 PyUnicode_1BYTE_DATA(from) + from_start,
1635 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1636 PyUnicode_4BYTE_DATA(to) + to_start
1637 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001638 }
1639 else if (from_kind == PyUnicode_2BYTE_KIND
1640 && to_kind == PyUnicode_4BYTE_KIND)
1641 {
1642 _PyUnicode_CONVERT_BYTES(
1643 Py_UCS2, Py_UCS4,
1644 PyUnicode_2BYTE_DATA(from) + from_start,
1645 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1646 PyUnicode_4BYTE_DATA(to) + to_start
1647 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001648 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001649 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001650 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1651
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001652 if (!check_maxchar) {
1653 if (from_kind == PyUnicode_2BYTE_KIND
1654 && to_kind == PyUnicode_1BYTE_KIND)
1655 {
1656 _PyUnicode_CONVERT_BYTES(
1657 Py_UCS2, Py_UCS1,
1658 PyUnicode_2BYTE_DATA(from) + from_start,
1659 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1660 PyUnicode_1BYTE_DATA(to) + to_start
1661 );
1662 }
1663 else if (from_kind == PyUnicode_4BYTE_KIND
1664 && to_kind == PyUnicode_1BYTE_KIND)
1665 {
1666 _PyUnicode_CONVERT_BYTES(
1667 Py_UCS4, Py_UCS1,
1668 PyUnicode_4BYTE_DATA(from) + from_start,
1669 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1670 PyUnicode_1BYTE_DATA(to) + to_start
1671 );
1672 }
1673 else if (from_kind == PyUnicode_4BYTE_KIND
1674 && to_kind == PyUnicode_2BYTE_KIND)
1675 {
1676 _PyUnicode_CONVERT_BYTES(
1677 Py_UCS4, Py_UCS2,
1678 PyUnicode_4BYTE_DATA(from) + from_start,
1679 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1680 PyUnicode_2BYTE_DATA(to) + to_start
1681 );
1682 }
1683 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001684 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001685 }
1686 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001687 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001688 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001689 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001690 Py_ssize_t i;
1691
Victor Stinnera0702ab2011-09-29 14:14:38 +02001692 for (i=0; i < how_many; i++) {
1693 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001694 if (ch > to_maxchar)
1695 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001696 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1697 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001698 }
1699 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001700 return 0;
1701}
1702
Victor Stinnerd3f08822012-05-29 12:57:52 +02001703void
1704_PyUnicode_FastCopyCharacters(
1705 PyObject *to, Py_ssize_t to_start,
1706 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001707{
1708 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1709}
1710
1711Py_ssize_t
1712PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1713 PyObject *from, Py_ssize_t from_start,
1714 Py_ssize_t how_many)
1715{
1716 int err;
1717
1718 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1719 PyErr_BadInternalCall();
1720 return -1;
1721 }
1722
Benjamin Petersonbac79492012-01-14 13:34:47 -05001723 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001724 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001725 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001726 return -1;
1727
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001728 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001729 PyErr_SetString(PyExc_IndexError, "string index out of range");
1730 return -1;
1731 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001732 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001733 PyErr_SetString(PyExc_IndexError, "string index out of range");
1734 return -1;
1735 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001736 if (how_many < 0) {
1737 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1738 return -1;
1739 }
1740 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001741 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1742 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001743 "Cannot write %zi characters at %zi "
1744 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001745 how_many, to_start, PyUnicode_GET_LENGTH(to));
1746 return -1;
1747 }
1748
1749 if (how_many == 0)
1750 return 0;
1751
Victor Stinner488fa492011-12-12 00:01:39 +01001752 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001753 return -1;
1754
1755 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1756 if (err) {
1757 PyErr_Format(PyExc_SystemError,
1758 "Cannot copy %s characters "
1759 "into a string of %s characters",
1760 unicode_kind_name(from),
1761 unicode_kind_name(to));
1762 return -1;
1763 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001764 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765}
1766
Victor Stinner17222162011-09-28 22:15:37 +02001767/* Find the maximum code point and count the number of surrogate pairs so a
1768 correct string length can be computed before converting a string to UCS4.
1769 This function counts single surrogates as a character and not as a pair.
1770
1771 Return 0 on success, or -1 on error. */
1772static int
1773find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1774 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775{
1776 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001777 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778
Victor Stinnerc53be962011-10-02 21:33:54 +02001779 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780 *num_surrogates = 0;
1781 *maxchar = 0;
1782
1783 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001785 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1786 && (iter+1) < end
1787 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1788 {
1789 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1790 ++(*num_surrogates);
1791 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 }
1793 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001795 {
1796 ch = *iter;
1797 iter++;
1798 }
1799 if (ch > *maxchar) {
1800 *maxchar = ch;
1801 if (*maxchar > MAX_UNICODE) {
1802 PyErr_Format(PyExc_ValueError,
1803 "character U+%x is not in range [U+0000; U+10ffff]",
1804 ch);
1805 return -1;
1806 }
1807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808 }
1809 return 0;
1810}
1811
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001812int
1813_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814{
1815 wchar_t *end;
1816 Py_UCS4 maxchar = 0;
1817 Py_ssize_t num_surrogates;
1818#if SIZEOF_WCHAR_T == 2
1819 Py_ssize_t length_wo_surrogates;
1820#endif
1821
Georg Brandl7597add2011-10-05 16:36:47 +02001822 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001823 strings were created using _PyObject_New() and where no canonical
1824 representation (the str field) has been set yet aka strings
1825 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001826 assert(_PyUnicode_CHECK(unicode));
1827 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001830 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001831 /* Actually, it should neither be interned nor be anything else: */
1832 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001835 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001836 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838
1839 if (maxchar < 256) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001840 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001841 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 PyErr_NoMemory();
1843 return -1;
1844 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001845 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 _PyUnicode_WSTR(unicode), end,
1847 PyUnicode_1BYTE_DATA(unicode));
1848 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1849 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1850 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1851 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001852 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001853 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001854 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 }
1856 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001858 _PyUnicode_UTF8(unicode) = NULL;
1859 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001861 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 _PyUnicode_WSTR(unicode) = NULL;
1863 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1864 }
1865 /* In this case we might have to convert down from 4-byte native
1866 wchar_t to 2-byte unicode. */
1867 else if (maxchar < 65536) {
1868 assert(num_surrogates == 0 &&
1869 "FindMaxCharAndNumSurrogatePairs() messed up");
1870
Victor Stinner506f5922011-09-28 22:34:18 +02001871#if SIZEOF_WCHAR_T == 2
1872 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001873 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001874 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1875 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1876 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001877 _PyUnicode_UTF8(unicode) = NULL;
1878 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001879#else
1880 /* sizeof(wchar_t) == 4 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001881 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
Victor Stinner506f5922011-09-28 22:34:18 +02001882 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001883 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001884 PyErr_NoMemory();
1885 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 }
Victor Stinner506f5922011-09-28 22:34:18 +02001887 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1888 _PyUnicode_WSTR(unicode), end,
1889 PyUnicode_2BYTE_DATA(unicode));
1890 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1891 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1892 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001893 _PyUnicode_UTF8(unicode) = NULL;
1894 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner32bd68c2020-12-01 10:37:39 +01001895 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinner506f5922011-09-28 22:34:18 +02001896 _PyUnicode_WSTR(unicode) = NULL;
1897 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1898#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 }
Ikko Ashimine38811d62020-11-10 14:57:34 +09001900 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 else {
1902#if SIZEOF_WCHAR_T == 2
1903 /* in case the native representation is 2-bytes, we need to allocate a
1904 new normalized 4-byte version. */
1905 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001906 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1907 PyErr_NoMemory();
1908 return -1;
1909 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001910 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001911 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 PyErr_NoMemory();
1913 return -1;
1914 }
1915 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1916 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001917 _PyUnicode_UTF8(unicode) = NULL;
1918 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001919 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1920 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001921 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001922 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 _PyUnicode_WSTR(unicode) = NULL;
1924 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1925#else
1926 assert(num_surrogates == 0);
1927
Victor Stinnerc3c74152011-10-02 20:39:55 +02001928 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001930 _PyUnicode_UTF8(unicode) = NULL;
1931 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1933#endif
1934 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1935 }
1936 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001937 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001938 return 0;
1939}
1940
Alexander Belopolsky40018472011-02-26 01:02:56 +00001941static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001942unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943{
Walter Dörwald16807132007-05-25 13:52:07 +00001944 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 case SSTATE_NOT_INTERNED:
1946 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001947
Benjamin Peterson29060642009-01-31 22:14:21 +00001948 case SSTATE_INTERNED_MORTAL:
Victor Stinner607b1022020-05-05 18:50:30 +02001949#ifdef INTERNED_STRINGS
Victor Stinner3549ca32020-07-03 16:59:12 +02001950 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1951 references (key and value) which were ignored by
1952 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1953 to prevent calling unicode_dealloc() again. Adjust refcnt after
1954 PyDict_DelItem(). */
1955 assert(Py_REFCNT(unicode) == 0);
1956 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001957 if (PyDict_DelItem(interned, unicode) != 0) {
1958 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1959 NULL);
1960 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001961 assert(Py_REFCNT(unicode) == 1);
1962 Py_SET_REFCNT(unicode, 0);
Victor Stinner607b1022020-05-05 18:50:30 +02001963#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001964 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001965
Benjamin Peterson29060642009-01-31 22:14:21 +00001966 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001967 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1968 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001969
Benjamin Peterson29060642009-01-31 22:14:21 +00001970 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001971 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001972 }
1973
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001974 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001975 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001976 }
1977 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001978 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001979 }
1980 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001981 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001984 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985}
1986
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects
   cached in the unicode state (the empty string, or a cached one-character
   string whose code point is below 256); return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();

    if (state->empty_string == unicode) {
        return 1;
    }

    /* Only a ready (non wchar_t kind) string of length 1 can be a cached
       single-character object. */
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && state->latin1[ch] == unicode) ? 1 : 0;
}
#endif
2006
Alexander Belopolsky40018472011-02-26 01:02:56 +00002007static int
Victor Stinner488fa492011-12-12 00:01:39 +01002008unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002009{
Victor Stinner488fa492011-12-12 00:01:39 +01002010 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011 if (Py_REFCNT(unicode) != 1)
2012 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002013 if (_PyUnicode_HASH(unicode) != -1)
2014 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002015 if (PyUnicode_CHECK_INTERNED(unicode))
2016 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002017 if (!PyUnicode_CheckExact(unicode))
2018 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002019#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002020 /* singleton refcount is greater than 1 */
2021 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002022#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002023 return 1;
2024}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002025
Victor Stinnerfe226c02011-10-03 03:52:20 +02002026static int
2027unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2028{
2029 PyObject *unicode;
2030 Py_ssize_t old_length;
2031
2032 assert(p_unicode != NULL);
2033 unicode = *p_unicode;
2034
2035 assert(unicode != NULL);
2036 assert(PyUnicode_Check(unicode));
2037 assert(0 <= length);
2038
Victor Stinner910337b2011-10-03 03:20:16 +02002039 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002040 old_length = PyUnicode_WSTR_LENGTH(unicode);
2041 else
2042 old_length = PyUnicode_GET_LENGTH(unicode);
2043 if (old_length == length)
2044 return 0;
2045
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002046 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002047 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002048 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002049 return 0;
2050 }
2051
Victor Stinner488fa492011-12-12 00:01:39 +01002052 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002053 PyObject *copy = resize_copy(unicode, length);
2054 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002055 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002056 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002057 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002058 }
2059
Victor Stinnerfe226c02011-10-03 03:52:20 +02002060 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002061 PyObject *new_unicode = resize_compact(unicode, length);
2062 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002063 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002064 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002065 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002066 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002067 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002068}
2069
Alexander Belopolsky40018472011-02-26 01:02:56 +00002070int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002071PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002072{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002073 PyObject *unicode;
2074 if (p_unicode == NULL) {
2075 PyErr_BadInternalCall();
2076 return -1;
2077 }
2078 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002079 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002080 {
2081 PyErr_BadInternalCall();
2082 return -1;
2083 }
2084 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002085}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002086
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002087/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002088
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002089 WARNING: The function doesn't copy the terminating null character and
2090 doesn't check the maximum character (may write a latin1 character in an
2091 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002092static void
2093unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2094 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002095{
2096 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002097 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002098 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002099
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002100 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002101 switch (kind) {
2102 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002103#ifdef Py_DEBUG
2104 if (PyUnicode_IS_ASCII(unicode)) {
2105 Py_UCS4 maxchar = ucs1lib_find_max_char(
2106 (const Py_UCS1*)str,
2107 (const Py_UCS1*)str + len);
2108 assert(maxchar < 128);
2109 }
2110#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002111 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002112 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002113 }
2114 case PyUnicode_2BYTE_KIND: {
2115 Py_UCS2 *start = (Py_UCS2 *)data + index;
2116 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002117
Victor Stinner184252a2012-06-16 02:57:41 +02002118 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002119 *ucs2 = (Py_UCS2)*str;
2120
2121 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002122 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002123 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002124 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002125 Py_UCS4 *start = (Py_UCS4 *)data + index;
2126 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002127
Victor Stinner184252a2012-06-16 02:57:41 +02002128 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002129 *ucs4 = (Py_UCS4)*str;
2130
2131 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002132 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002133 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002134 default:
2135 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002136 }
2137}
2138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002140get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002141{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002142 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002143
Victor Stinner2f9ada92020-06-24 02:22:21 +02002144 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002145 if (unicode) {
2146 Py_INCREF(unicode);
2147 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 }
Victor Stinner607b1022020-05-05 18:50:30 +02002149
2150 unicode = PyUnicode_New(1, ch);
2151 if (!unicode) {
2152 return NULL;
2153 }
2154
2155 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2156 assert(_PyUnicode_CheckConsistency(unicode, 1));
2157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002159 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002160 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002161}
2162
Victor Stinner985a82a2014-01-03 12:53:47 +01002163static PyObject*
2164unicode_char(Py_UCS4 ch)
2165{
2166 PyObject *unicode;
2167
2168 assert(ch <= MAX_UNICODE);
2169
Victor Stinner2f9ada92020-06-24 02:22:21 +02002170 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002171 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002172 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002173
Victor Stinner985a82a2014-01-03 12:53:47 +01002174 unicode = PyUnicode_New(1, ch);
2175 if (unicode == NULL)
2176 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002177
2178 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2179 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002180 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002181 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002182 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2183 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2184 }
2185 assert(_PyUnicode_CheckConsistency(unicode, 1));
2186 return unicode;
2187}
2188
Alexander Belopolsky40018472011-02-26 01:02:56 +00002189PyObject *
2190PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002192 if (u == NULL) {
2193 if (size > 0) {
2194 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2195 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2196 "use PyUnicode_New() instead", 1) < 0) {
2197 return NULL;
2198 }
2199 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002200 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002201 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002202
2203 if (size < 0) {
2204 PyErr_BadInternalCall();
2205 return NULL;
2206 }
2207
2208 return PyUnicode_FromWideChar(u, size);
2209}
2210
2211PyObject *
2212PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2213{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002214 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 Py_UCS4 maxchar = 0;
2216 Py_ssize_t num_surrogates;
2217
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002218 if (u == NULL && size != 0) {
2219 PyErr_BadInternalCall();
2220 return NULL;
2221 }
2222
2223 if (size == -1) {
2224 size = wcslen(u);
2225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002227 /* If the Unicode data is known at construction time, we can apply
2228 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002231 if (size == 0)
2232 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 /* Single character Unicode objects in the Latin-1 range are
2235 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002236 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 return get_latin1_char((unsigned char)*u);
2238
2239 /* If not empty and not single character, copy the Unicode data
2240 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002241 if (find_maxchar_surrogates(u, u + size,
2242 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 return NULL;
2244
Victor Stinner8faf8212011-12-08 22:14:11 +01002245 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246 if (!unicode)
2247 return NULL;
2248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 switch (PyUnicode_KIND(unicode)) {
2250 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002251 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2253 break;
2254 case PyUnicode_2BYTE_KIND:
2255#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002256 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002258 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2260#endif
2261 break;
2262 case PyUnicode_4BYTE_KIND:
2263#if SIZEOF_WCHAR_T == 2
2264 /* This is the only case which has to process surrogates, thus
2265 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002266 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267#else
2268 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002269 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270#endif
2271 break;
2272 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002273 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002276 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277}
2278
Alexander Belopolsky40018472011-02-26 01:02:56 +00002279PyObject *
2280PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002281{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 if (size < 0) {
2283 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002284 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 return NULL;
2286 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002287 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002288 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002289 }
2290 else {
2291 if (size > 0) {
2292 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2293 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2294 "use PyUnicode_New() instead", 1) < 0) {
2295 return NULL;
2296 }
2297 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002298 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002299 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002300}
2301
Alexander Belopolsky40018472011-02-26 01:02:56 +00002302PyObject *
2303PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002304{
2305 size_t size = strlen(u);
2306 if (size > PY_SSIZE_T_MAX) {
2307 PyErr_SetString(PyExc_OverflowError, "input too long");
2308 return NULL;
2309 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002310 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002311}
2312
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002313
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002314PyObject *
2315_PyUnicode_FromId(_Py_Identifier *id)
2316{
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002317 PyInterpreterState *interp = _PyInterpreterState_GET();
2318 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2319
2320 int index = _Py_atomic_size_get(&id->index);
2321 if (index < 0) {
2322 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2323
2324 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2325 // Check again to detect concurrent access. Another thread can have
2326 // initialized the index while this thread waited for the lock.
2327 index = _Py_atomic_size_get(&id->index);
2328 if (index < 0) {
2329 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2330 index = rt_ids->next_index;
2331 rt_ids->next_index++;
2332 _Py_atomic_size_set(&id->index, index);
2333 }
2334 PyThread_release_lock(rt_ids->lock);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002335 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002336 assert(index >= 0);
Victor Stinner297257f2020-06-02 14:39:45 +02002337
2338 PyObject *obj;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002339 if (index < ids->size) {
2340 obj = ids->array[index];
2341 if (obj) {
2342 // Return a borrowed reference
2343 return obj;
2344 }
2345 }
2346
2347 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
Victor Stinner297257f2020-06-02 14:39:45 +02002348 NULL, NULL);
2349 if (!obj) {
2350 return NULL;
2351 }
2352 PyUnicode_InternInPlace(&obj);
2353
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002354 if (index >= ids->size) {
2355 // Overallocate to reduce the number of realloc
2356 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2357 Py_ssize_t item_size = sizeof(ids->array[0]);
2358 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2359 if (new_array == NULL) {
2360 PyErr_NoMemory();
2361 return NULL;
2362 }
2363 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2364 ids->array = new_array;
2365 ids->size = new_size;
2366 }
2367
2368 // The array stores a strong reference
2369 ids->array[index] = obj;
2370
2371 // Return a borrowed reference
2372 return obj;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002373}
2374
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002375
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002376static void
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002377unicode_clear_identifiers(PyThreadState *tstate)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002378{
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002379 PyInterpreterState *interp = _PyInterpreterState_GET();
2380 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2381 for (Py_ssize_t i=0; i < ids->size; i++) {
2382 Py_XDECREF(ids->array[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002383 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002384 ids->size = 0;
2385 PyMem_Free(ids->array);
2386 ids->array = NULL;
2387 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2388 // after Py_Finalize().
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002389}
2390
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002391
Benjamin Peterson0df54292012-03-26 14:50:32 -04002392/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002393
Victor Stinnerd3f08822012-05-29 12:57:52 +02002394PyObject*
2395_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002396{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002397 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002398 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002399 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002400#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002401 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002402#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002403 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002404 }
Victor Stinner785938e2011-12-11 20:09:03 +01002405 unicode = PyUnicode_New(size, 127);
2406 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002407 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002408 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2409 assert(_PyUnicode_CheckConsistency(unicode, 1));
2410 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002411}
2412
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002413static Py_UCS4
2414kind_maxchar_limit(unsigned int kind)
2415{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002416 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002417 case PyUnicode_1BYTE_KIND:
2418 return 0x80;
2419 case PyUnicode_2BYTE_KIND:
2420 return 0x100;
2421 case PyUnicode_4BYTE_KIND:
2422 return 0x10000;
2423 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002424 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002425 }
2426}
2427
Victor Stinner702c7342011-10-05 13:50:52 +02002428static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002429_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002431 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002432 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002433
Victor Stinner2f9ada92020-06-24 02:22:21 +02002434 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002435 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002436 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002437 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002438 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002439 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002440 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002441
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002442 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002443 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 if (!res)
2445 return NULL;
2446 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002447 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002449}
2450
Victor Stinnere57b1c02011-09-28 22:20:48 +02002451static PyObject*
2452_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453{
2454 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002455 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002456
Serhiy Storchaka678db842013-01-26 12:16:36 +02002457 if (size == 0)
2458 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002459 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002460 if (size == 1)
2461 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002462
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002463 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002464 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 if (!res)
2466 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002467 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002469 else {
2470 _PyUnicode_CONVERT_BYTES(
2471 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2472 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002473 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 return res;
2475}
2476
Victor Stinnere57b1c02011-09-28 22:20:48 +02002477static PyObject*
2478_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479{
2480 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002481 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002482
Serhiy Storchaka678db842013-01-26 12:16:36 +02002483 if (size == 0)
2484 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002485 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002486 if (size == 1)
2487 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002488
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002489 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002490 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 if (!res)
2492 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002493 if (max_char < 256)
2494 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2495 PyUnicode_1BYTE_DATA(res));
2496 else if (max_char < 0x10000)
2497 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2498 PyUnicode_2BYTE_DATA(res));
2499 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002501 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 return res;
2503}
2504
2505PyObject*
2506PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2507{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002508 if (size < 0) {
2509 PyErr_SetString(PyExc_ValueError, "size must be positive");
2510 return NULL;
2511 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002512 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002514 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002516 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002518 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002519 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002520 PyErr_SetString(PyExc_SystemError, "invalid kind");
2521 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523}
2524
Victor Stinnerece58de2012-04-23 23:36:38 +02002525Py_UCS4
2526_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2527{
2528 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002529 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002530
2531 assert(PyUnicode_IS_READY(unicode));
2532 assert(0 <= start);
2533 assert(end <= PyUnicode_GET_LENGTH(unicode));
2534 assert(start <= end);
2535
2536 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2537 return PyUnicode_MAX_CHAR_VALUE(unicode);
2538
2539 if (start == end)
2540 return 127;
2541
Victor Stinner94d558b2012-04-27 22:26:58 +02002542 if (PyUnicode_IS_ASCII(unicode))
2543 return 127;
2544
Victor Stinnerece58de2012-04-23 23:36:38 +02002545 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002546 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002547 endptr = (char *)startptr + end * kind;
2548 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002549 switch(kind) {
2550 case PyUnicode_1BYTE_KIND:
2551 return ucs1lib_find_max_char(startptr, endptr);
2552 case PyUnicode_2BYTE_KIND:
2553 return ucs2lib_find_max_char(startptr, endptr);
2554 case PyUnicode_4BYTE_KIND:
2555 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002556 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002557 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002558 }
2559}
2560
Victor Stinner25a4b292011-10-06 12:31:55 +02002561/* Ensure that a string uses the most efficient storage, if it is not the
2562 case: create a new string with of the right kind. Write NULL into *p_unicode
2563 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002564static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002565unicode_adjust_maxchar(PyObject **p_unicode)
2566{
2567 PyObject *unicode, *copy;
2568 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002569 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002570 unsigned int kind;
2571
2572 assert(p_unicode != NULL);
2573 unicode = *p_unicode;
2574 assert(PyUnicode_IS_READY(unicode));
2575 if (PyUnicode_IS_ASCII(unicode))
2576 return;
2577
2578 len = PyUnicode_GET_LENGTH(unicode);
2579 kind = PyUnicode_KIND(unicode);
2580 if (kind == PyUnicode_1BYTE_KIND) {
2581 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002582 max_char = ucs1lib_find_max_char(u, u + len);
2583 if (max_char >= 128)
2584 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002585 }
2586 else if (kind == PyUnicode_2BYTE_KIND) {
2587 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002588 max_char = ucs2lib_find_max_char(u, u + len);
2589 if (max_char >= 256)
2590 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002591 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002592 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002593 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002594 max_char = ucs4lib_find_max_char(u, u + len);
2595 if (max_char >= 0x10000)
2596 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002597 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002598 else
2599 Py_UNREACHABLE();
2600
Victor Stinner25a4b292011-10-06 12:31:55 +02002601 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002602 if (copy != NULL)
2603 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002604 Py_DECREF(unicode);
2605 *p_unicode = copy;
2606}
2607
Victor Stinner034f6cf2011-09-30 02:26:44 +02002608PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002609_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002610{
Victor Stinner87af4f22011-11-21 23:03:47 +01002611 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002612 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002613
Victor Stinner034f6cf2011-09-30 02:26:44 +02002614 if (!PyUnicode_Check(unicode)) {
2615 PyErr_BadInternalCall();
2616 return NULL;
2617 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002618 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002619 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002620
Victor Stinner87af4f22011-11-21 23:03:47 +01002621 length = PyUnicode_GET_LENGTH(unicode);
2622 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002623 if (!copy)
2624 return NULL;
2625 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2626
Christian Heimesf051e432016-09-13 20:22:02 +02002627 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002628 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002629 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002630 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002631}
2632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633
Victor Stinnerbc603d12011-10-02 01:00:40 +02002634/* Widen Unicode objects to larger buffers. Don't write terminating null
2635 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002637static void*
2638unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002640 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002641
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002642 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002643 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002644 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002645 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002646 if (!result)
2647 return PyErr_NoMemory();
2648 assert(skind == PyUnicode_1BYTE_KIND);
2649 _PyUnicode_CONVERT_BYTES(
2650 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002651 (const Py_UCS1 *)data,
2652 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002653 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002655 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002656 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002657 if (!result)
2658 return PyErr_NoMemory();
2659 if (skind == PyUnicode_2BYTE_KIND) {
2660 _PyUnicode_CONVERT_BYTES(
2661 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002662 (const Py_UCS2 *)data,
2663 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002664 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002665 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002666 else {
2667 assert(skind == PyUnicode_1BYTE_KIND);
2668 _PyUnicode_CONVERT_BYTES(
2669 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002670 (const Py_UCS1 *)data,
2671 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002672 result);
2673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002675 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002676 Py_UNREACHABLE();
2677 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002679}
2680
2681static Py_UCS4*
2682as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2683 int copy_null)
2684{
2685 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002686 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687 Py_ssize_t len, targetlen;
2688 if (PyUnicode_READY(string) == -1)
2689 return NULL;
2690 kind = PyUnicode_KIND(string);
2691 data = PyUnicode_DATA(string);
2692 len = PyUnicode_GET_LENGTH(string);
2693 targetlen = len;
2694 if (copy_null)
2695 targetlen++;
2696 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002697 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 if (!target) {
2699 PyErr_NoMemory();
2700 return NULL;
2701 }
2702 }
2703 else {
2704 if (targetsize < targetlen) {
2705 PyErr_Format(PyExc_SystemError,
2706 "string is longer than the buffer");
2707 if (copy_null && 0 < targetsize)
2708 target[0] = 0;
2709 return NULL;
2710 }
2711 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002712 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002713 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002714 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002716 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002717 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002718 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2719 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002720 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002721 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002722 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002723 else {
2724 Py_UNREACHABLE();
2725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002726 if (copy_null)
2727 target[len] = 0;
2728 return target;
2729}
2730
2731Py_UCS4*
2732PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2733 int copy_null)
2734{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002735 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 PyErr_BadInternalCall();
2737 return NULL;
2738 }
2739 return as_ucs4(string, target, targetsize, copy_null);
2740}
2741
2742Py_UCS4*
2743PyUnicode_AsUCS4Copy(PyObject *string)
2744{
2745 return as_ucs4(string, NULL, 0, 1);
2746}
2747
/* Size of the scratch buffers used to format the output of %lld or %p.
   A value of SIZEOF_LONG_LONG bytes needs at most
   ceil(log10(256)*SIZEOF_LONG_LONG) digits, and 53/22 is an upper bound for
   log10(256).  The expression below equals 1 + ceil(SIZEOF_LONG_LONG*53/22):
   the digits plus 1 for the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002752
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002753static int
2754unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2755 Py_ssize_t width, Py_ssize_t precision)
2756{
2757 Py_ssize_t length, fill, arglen;
2758 Py_UCS4 maxchar;
2759
2760 if (PyUnicode_READY(str) == -1)
2761 return -1;
2762
2763 length = PyUnicode_GET_LENGTH(str);
2764 if ((precision == -1 || precision >= length)
2765 && width <= length)
2766 return _PyUnicodeWriter_WriteStr(writer, str);
2767
2768 if (precision != -1)
2769 length = Py_MIN(precision, length);
2770
2771 arglen = Py_MAX(length, width);
2772 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2773 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2774 else
2775 maxchar = writer->maxchar;
2776
2777 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2778 return -1;
2779
2780 if (width > length) {
2781 fill = width - length;
2782 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2783 return -1;
2784 writer->pos += fill;
2785 }
2786
2787 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2788 str, 0, length);
2789 writer->pos += length;
2790 return 0;
2791}
2792
2793static int
Victor Stinner998b8062018-09-12 00:23:25 +02002794unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002795 Py_ssize_t width, Py_ssize_t precision)
2796{
2797 /* UTF-8 */
2798 Py_ssize_t length;
2799 PyObject *unicode;
2800 int res;
2801
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002802 if (precision == -1) {
2803 length = strlen(str);
2804 }
2805 else {
2806 length = 0;
2807 while (length < precision && str[length]) {
2808 length++;
2809 }
2810 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002811 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2812 if (unicode == NULL)
2813 return -1;
2814
2815 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2816 Py_DECREF(unicode);
2817 return res;
2818}
2819
Victor Stinner96865452011-03-01 23:44:09 +00002820static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002821unicode_fromformat_arg(_PyUnicodeWriter *writer,
2822 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002823{
Victor Stinnere215d962012-10-06 23:03:36 +02002824 const char *p;
2825 Py_ssize_t len;
2826 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002827 Py_ssize_t width;
2828 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002829 int longflag;
2830 int longlongflag;
2831 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002832 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002833
2834 p = f;
2835 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002836 zeropad = 0;
2837 if (*f == '0') {
2838 zeropad = 1;
2839 f++;
2840 }
Victor Stinner96865452011-03-01 23:44:09 +00002841
2842 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002843 width = -1;
2844 if (Py_ISDIGIT((unsigned)*f)) {
2845 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002846 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002847 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002848 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002849 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002850 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002851 return NULL;
2852 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002853 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002854 f++;
2855 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002856 }
2857 precision = -1;
2858 if (*f == '.') {
2859 f++;
2860 if (Py_ISDIGIT((unsigned)*f)) {
2861 precision = (*f - '0');
2862 f++;
2863 while (Py_ISDIGIT((unsigned)*f)) {
2864 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2865 PyErr_SetString(PyExc_ValueError,
2866 "precision too big");
2867 return NULL;
2868 }
2869 precision = (precision * 10) + (*f - '0');
2870 f++;
2871 }
2872 }
Victor Stinner96865452011-03-01 23:44:09 +00002873 if (*f == '%') {
2874 /* "%.3%s" => f points to "3" */
2875 f--;
2876 }
2877 }
2878 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002879 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002880 f--;
2881 }
Victor Stinner96865452011-03-01 23:44:09 +00002882
2883 /* Handle %ld, %lu, %lld and %llu. */
2884 longflag = 0;
2885 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002886 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002887 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002888 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002889 longflag = 1;
2890 ++f;
2891 }
Victor Stinner96865452011-03-01 23:44:09 +00002892 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002893 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002894 longlongflag = 1;
2895 f += 2;
2896 }
Victor Stinner96865452011-03-01 23:44:09 +00002897 }
2898 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002899 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002900 size_tflag = 1;
2901 ++f;
2902 }
Victor Stinnere215d962012-10-06 23:03:36 +02002903
2904 if (f[1] == '\0')
2905 writer->overallocate = 0;
2906
2907 switch (*f) {
2908 case 'c':
2909 {
2910 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002911 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002912 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002913 "character argument not in range(0x110000)");
2914 return NULL;
2915 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002916 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002917 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002918 break;
2919 }
2920
2921 case 'i':
2922 case 'd':
2923 case 'u':
2924 case 'x':
2925 {
2926 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002927 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002928 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002929
2930 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002931 if (longflag) {
2932 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2933 }
2934 else if (longlongflag) {
2935 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2936 }
2937 else if (size_tflag) {
2938 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2939 }
2940 else {
2941 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2942 }
Victor Stinnere215d962012-10-06 23:03:36 +02002943 }
2944 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002945 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002946 }
2947 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002948 if (longflag) {
2949 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2950 }
2951 else if (longlongflag) {
2952 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2953 }
2954 else if (size_tflag) {
2955 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2956 }
2957 else {
2958 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2959 }
Victor Stinnere215d962012-10-06 23:03:36 +02002960 }
2961 assert(len >= 0);
2962
Victor Stinnere215d962012-10-06 23:03:36 +02002963 if (precision < len)
2964 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002965
2966 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002967 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2968 return NULL;
2969
Victor Stinnere215d962012-10-06 23:03:36 +02002970 if (width > precision) {
2971 Py_UCS4 fillchar;
2972 fill = width - precision;
2973 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002974 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2975 return NULL;
2976 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002977 }
Victor Stinner15a11362012-10-06 23:48:20 +02002978 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002979 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002980 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2981 return NULL;
2982 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002983 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002984
Victor Stinner4a587072013-11-19 12:54:53 +01002985 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2986 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002987 break;
2988 }
2989
2990 case 'p':
2991 {
2992 char number[MAX_LONG_LONG_CHARS];
2993
2994 len = sprintf(number, "%p", va_arg(*vargs, void*));
2995 assert(len >= 0);
2996
2997 /* %p is ill-defined: ensure leading 0x. */
2998 if (number[1] == 'X')
2999 number[1] = 'x';
3000 else if (number[1] != 'x') {
3001 memmove(number + 2, number,
3002 strlen(number) + 1);
3003 number[0] = '0';
3004 number[1] = 'x';
3005 len += 2;
3006 }
3007
Victor Stinner4a587072013-11-19 12:54:53 +01003008 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003009 return NULL;
3010 break;
3011 }
3012
3013 case 's':
3014 {
3015 /* UTF-8 */
3016 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02003017 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003018 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003019 break;
3020 }
3021
3022 case 'U':
3023 {
3024 PyObject *obj = va_arg(*vargs, PyObject *);
3025 assert(obj && _PyUnicode_CHECK(obj));
3026
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003027 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003028 return NULL;
3029 break;
3030 }
3031
3032 case 'V':
3033 {
3034 PyObject *obj = va_arg(*vargs, PyObject *);
3035 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02003036 if (obj) {
3037 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003038 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003039 return NULL;
3040 }
3041 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003042 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02003043 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003044 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003045 }
3046 break;
3047 }
3048
3049 case 'S':
3050 {
3051 PyObject *obj = va_arg(*vargs, PyObject *);
3052 PyObject *str;
3053 assert(obj);
3054 str = PyObject_Str(obj);
3055 if (!str)
3056 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003057 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003058 Py_DECREF(str);
3059 return NULL;
3060 }
3061 Py_DECREF(str);
3062 break;
3063 }
3064
3065 case 'R':
3066 {
3067 PyObject *obj = va_arg(*vargs, PyObject *);
3068 PyObject *repr;
3069 assert(obj);
3070 repr = PyObject_Repr(obj);
3071 if (!repr)
3072 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003073 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003074 Py_DECREF(repr);
3075 return NULL;
3076 }
3077 Py_DECREF(repr);
3078 break;
3079 }
3080
3081 case 'A':
3082 {
3083 PyObject *obj = va_arg(*vargs, PyObject *);
3084 PyObject *ascii;
3085 assert(obj);
3086 ascii = PyObject_ASCII(obj);
3087 if (!ascii)
3088 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003089 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003090 Py_DECREF(ascii);
3091 return NULL;
3092 }
3093 Py_DECREF(ascii);
3094 break;
3095 }
3096
3097 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003098 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003099 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003100 break;
3101
3102 default:
3103 /* if we stumble upon an unknown formatting code, copy the rest
3104 of the format string to the output string. (we cannot just
3105 skip the code, since there's no way to know what's in the
3106 argument list) */
3107 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003108 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003109 return NULL;
3110 f = p+len;
3111 return f;
3112 }
3113
3114 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003115 return f;
3116}
3117
Walter Dörwaldd2034312007-05-18 16:29:38 +00003118PyObject *
3119PyUnicode_FromFormatV(const char *format, va_list vargs)
3120{
Victor Stinnere215d962012-10-06 23:03:36 +02003121 va_list vargs2;
3122 const char *f;
3123 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003124
Victor Stinner8f674cc2013-04-17 23:02:17 +02003125 _PyUnicodeWriter_Init(&writer);
3126 writer.min_length = strlen(format) + 100;
3127 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003128
Benjamin Peterson0c212142016-09-20 20:39:33 -07003129 // Copy varags to be able to pass a reference to a subfunction.
3130 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003131
3132 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003133 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003134 f = unicode_fromformat_arg(&writer, f, &vargs2);
3135 if (f == NULL)
3136 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003138 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003139 const char *p;
3140 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003141
Victor Stinnere215d962012-10-06 23:03:36 +02003142 p = f;
3143 do
3144 {
3145 if ((unsigned char)*p > 127) {
3146 PyErr_Format(PyExc_ValueError,
3147 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3148 "string, got a non-ASCII byte: 0x%02x",
3149 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003150 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003151 }
3152 p++;
3153 }
3154 while (*p != '\0' && *p != '%');
3155 len = p - f;
3156
3157 if (*p == '\0')
3158 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003159
3160 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003161 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003162
3163 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003164 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003165 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003166 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003167 return _PyUnicodeWriter_Finish(&writer);
3168
3169 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003170 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003171 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003172 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003173}
3174
Walter Dörwaldd2034312007-05-18 16:29:38 +00003175PyObject *
3176PyUnicode_FromFormat(const char *format, ...)
3177{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003178 PyObject* ret;
3179 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003180
3181#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003182 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003183#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003184 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003185#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003186 ret = PyUnicode_FromFormatV(format, vargs);
3187 va_end(vargs);
3188 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003189}
3190
Serhiy Storchakac46db922018-10-23 22:58:24 +03003191static Py_ssize_t
3192unicode_get_widechar_size(PyObject *unicode)
3193{
3194 Py_ssize_t res;
3195
3196 assert(unicode != NULL);
3197 assert(_PyUnicode_CHECK(unicode));
3198
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003199#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003200 if (_PyUnicode_WSTR(unicode) != NULL) {
3201 return PyUnicode_WSTR_LENGTH(unicode);
3202 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003203#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003204 assert(PyUnicode_IS_READY(unicode));
3205
3206 res = _PyUnicode_LENGTH(unicode);
3207#if SIZEOF_WCHAR_T == 2
3208 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3209 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3210 const Py_UCS4 *end = s + res;
3211 for (; s < end; ++s) {
3212 if (*s > 0xFFFF) {
3213 ++res;
3214 }
3215 }
3216 }
3217#endif
3218 return res;
3219}
3220
3221static void
3222unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3223{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003224 assert(unicode != NULL);
3225 assert(_PyUnicode_CHECK(unicode));
3226
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003227#if USE_UNICODE_WCHAR_CACHE
3228 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003229 if (wstr != NULL) {
3230 memcpy(w, wstr, size * sizeof(wchar_t));
3231 return;
3232 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003233#else /* USE_UNICODE_WCHAR_CACHE */
3234 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3235 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3236 return;
3237 }
3238#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003239 assert(PyUnicode_IS_READY(unicode));
3240
3241 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3242 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3243 for (; size--; ++s, ++w) {
3244 *w = *s;
3245 }
3246 }
3247 else {
3248#if SIZEOF_WCHAR_T == 4
3249 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3250 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3251 for (; size--; ++s, ++w) {
3252 *w = *s;
3253 }
3254#else
3255 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3256 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3257 for (; size--; ++s, ++w) {
3258 Py_UCS4 ch = *s;
3259 if (ch > 0xFFFF) {
3260 assert(ch <= MAX_UNICODE);
3261 /* encode surrogate pair in this case */
3262 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3263 if (!size--)
3264 break;
3265 *w = Py_UNICODE_LOW_SURROGATE(ch);
3266 }
3267 else {
3268 *w = ch;
3269 }
3270 }
3271#endif
3272 }
3273}
3274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003275#ifdef HAVE_WCHAR_H
3276
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003277/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003278
Victor Stinnerd88d9832011-09-06 02:00:05 +02003279 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003280 character) required to convert the unicode object. Ignore size argument.
3281
Victor Stinnerd88d9832011-09-06 02:00:05 +02003282 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003283 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003284 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003285Py_ssize_t
3286PyUnicode_AsWideChar(PyObject *unicode,
3287 wchar_t *w,
3288 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003289{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003290 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003291
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003292 if (unicode == NULL) {
3293 PyErr_BadInternalCall();
3294 return -1;
3295 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003296 if (!PyUnicode_Check(unicode)) {
3297 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003298 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003299 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003300
3301 res = unicode_get_widechar_size(unicode);
3302 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003303 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003304 }
3305
3306 if (size > res) {
3307 size = res + 1;
3308 }
3309 else {
3310 res = size;
3311 }
3312 unicode_copy_as_widechar(unicode, w, size);
3313 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003314}
3315
Victor Stinner137c34c2010-09-29 10:25:54 +00003316wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003317PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003318 Py_ssize_t *size)
3319{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003320 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003321 Py_ssize_t buflen;
3322
3323 if (unicode == NULL) {
3324 PyErr_BadInternalCall();
3325 return NULL;
3326 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003327 if (!PyUnicode_Check(unicode)) {
3328 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003329 return NULL;
3330 }
3331
Serhiy Storchakac46db922018-10-23 22:58:24 +03003332 buflen = unicode_get_widechar_size(unicode);
3333 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003334 if (buffer == NULL) {
3335 PyErr_NoMemory();
3336 return NULL;
3337 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003338 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3339 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003340 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003341 }
3342 else if (wcslen(buffer) != (size_t)buflen) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01003343 PyMem_Free(buffer);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003344 PyErr_SetString(PyExc_ValueError,
3345 "embedded null character");
3346 return NULL;
3347 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003348 return buffer;
3349}
3350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003353int
3354_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3355{
3356 wchar_t **p = (wchar_t **)ptr;
3357 if (obj == NULL) {
3358#if !USE_UNICODE_WCHAR_CACHE
3359 PyMem_Free(*p);
3360#endif /* USE_UNICODE_WCHAR_CACHE */
3361 *p = NULL;
3362 return 1;
3363 }
3364 if (PyUnicode_Check(obj)) {
3365#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003366 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3367 if (*p == NULL) {
3368 return 0;
3369 }
3370 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003371#else /* USE_UNICODE_WCHAR_CACHE */
3372 *p = PyUnicode_AsWideCharString(obj, NULL);
3373 if (*p == NULL) {
3374 return 0;
3375 }
3376 return Py_CLEANUP_SUPPORTED;
3377#endif /* USE_UNICODE_WCHAR_CACHE */
3378 }
3379 PyErr_Format(PyExc_TypeError,
3380 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003381 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003382 return 0;
3383}
3384
3385int
3386_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3387{
3388 wchar_t **p = (wchar_t **)ptr;
3389 if (obj == NULL) {
3390#if !USE_UNICODE_WCHAR_CACHE
3391 PyMem_Free(*p);
3392#endif /* USE_UNICODE_WCHAR_CACHE */
3393 *p = NULL;
3394 return 1;
3395 }
3396 if (obj == Py_None) {
3397 *p = NULL;
3398 return 1;
3399 }
3400 if (PyUnicode_Check(obj)) {
3401#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003402 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3403 if (*p == NULL) {
3404 return 0;
3405 }
3406 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003407#else /* USE_UNICODE_WCHAR_CACHE */
3408 *p = PyUnicode_AsWideCharString(obj, NULL);
3409 if (*p == NULL) {
3410 return 0;
3411 }
3412 return Py_CLEANUP_SUPPORTED;
3413#endif /* USE_UNICODE_WCHAR_CACHE */
3414 }
3415 PyErr_Format(PyExc_TypeError,
3416 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003417 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003418 return 0;
3419}
3420
Alexander Belopolsky40018472011-02-26 01:02:56 +00003421PyObject *
3422PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003423{
Victor Stinner8faf8212011-12-08 22:14:11 +01003424 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 PyErr_SetString(PyExc_ValueError,
3426 "chr() arg not in range(0x110000)");
3427 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003428 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003429
Victor Stinner985a82a2014-01-03 12:53:47 +01003430 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003431}
3432
Alexander Belopolsky40018472011-02-26 01:02:56 +00003433PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003434PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003436 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003437 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003438 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003439 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003440 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003441 Py_INCREF(obj);
3442 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003443 }
3444 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003445 /* For a Unicode subtype that's not a Unicode object,
3446 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003447 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003448 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003449 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003450 "Can't convert '%.100s' object to str implicitly",
3451 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003452 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003453}
3454
Alexander Belopolsky40018472011-02-26 01:02:56 +00003455PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003456PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003457 const char *encoding,
3458 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003459{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003460 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003461 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003462
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 PyErr_BadInternalCall();
3465 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003467
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003468 /* Decoding bytes objects is the most common case and should be fast */
3469 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003470 if (PyBytes_GET_SIZE(obj) == 0) {
3471 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3472 return NULL;
3473 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003474 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003475 }
3476 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003477 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3478 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003479 }
3480
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003481 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 PyErr_SetString(PyExc_TypeError,
3483 "decoding str is not supported");
3484 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003485 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003486
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003487 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3488 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3489 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003490 "decoding to str: need a bytes-like object, %.80s found",
3491 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003492 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003493 }
Tim Petersced69f82003-09-16 20:30:58 +00003494
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003495 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003496 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003497 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3498 return NULL;
3499 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003500 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003502
Serhiy Storchaka05997252013-01-26 12:14:02 +02003503 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003504 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003505 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506}
3507
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   encoding:  NUL-terminated input name; must not be NULL.
   lower:     output buffer receiving the normalized, NUL-terminated name.
   lower_len: size of `lower` in bytes, including room for the NUL.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters becomes a single '_', except that a leading run is
   dropped and a trailing run produces nothing.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the terminating NUL fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *out_end;
    int pending_sep;

    assert(encoding != NULL);

    /* Without this guard, lower_len - 1 would wrap around (size_t) and the
       loop below would write past the end of the buffer. */
    if (lower_len == 0) {
        return 0;
    }

    out = lower;
    out_end = &lower[lower_len - 1];   /* slot reserved for the NUL */
    pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; emit it lazily before the next
               kept character. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3556
Alexander Belopolsky40018472011-02-26 01:02:56 +00003557PyObject *
3558PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003559 Py_ssize_t size,
3560 const char *encoding,
3561 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003562{
3563 PyObject *buffer = NULL, *unicode;
3564 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003565 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3566
Victor Stinner22eb6892019-06-26 00:51:05 +02003567 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3568 return NULL;
3569 }
3570
Victor Stinnered076ed2019-06-26 01:49:32 +02003571 if (size == 0) {
3572 _Py_RETURN_UNICODE_EMPTY();
3573 }
3574
Victor Stinner942889a2016-09-05 15:40:10 -07003575 if (encoding == NULL) {
3576 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3577 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003578
Fred Drakee4315f52000-05-09 19:53:39 +00003579 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003580 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3581 char *lower = buflower;
3582
3583 /* Fast paths */
3584 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3585 lower += 3;
3586 if (*lower == '_') {
3587 /* Match "utf8" and "utf_8" */
3588 lower++;
3589 }
3590
3591 if (lower[0] == '8' && lower[1] == 0) {
3592 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3593 }
3594 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3595 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3596 }
3597 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3598 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3599 }
3600 }
3601 else {
3602 if (strcmp(lower, "ascii") == 0
3603 || strcmp(lower, "us_ascii") == 0) {
3604 return PyUnicode_DecodeASCII(s, size, errors);
3605 }
Steve Dowercc16be82016-09-08 10:35:16 -07003606 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003607 else if (strcmp(lower, "mbcs") == 0) {
3608 return PyUnicode_DecodeMBCS(s, size, errors);
3609 }
3610 #endif
3611 else if (strcmp(lower, "latin1") == 0
3612 || strcmp(lower, "latin_1") == 0
3613 || strcmp(lower, "iso_8859_1") == 0
3614 || strcmp(lower, "iso8859_1") == 0) {
3615 return PyUnicode_DecodeLatin1(s, size, errors);
3616 }
3617 }
Victor Stinner37296e82010-06-10 13:36:23 +00003618 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619
3620 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003621 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003622 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003623 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003624 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625 if (buffer == NULL)
3626 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003627 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 if (unicode == NULL)
3629 goto onError;
3630 if (!PyUnicode_Check(unicode)) {
3631 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003632 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003633 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003634 encoding,
3635 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 Py_DECREF(unicode);
3637 goto onError;
3638 }
3639 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003640 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003641
Benjamin Peterson29060642009-01-31 22:14:21 +00003642 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643 Py_XDECREF(buffer);
3644 return NULL;
3645}
3646
Alexander Belopolsky40018472011-02-26 01:02:56 +00003647PyObject *
3648PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003649 const char *encoding,
3650 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003651{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003652 if (!PyUnicode_Check(unicode)) {
3653 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003654 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003655 }
3656
Serhiy Storchaka00939072016-10-27 21:05:49 +03003657 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3658 "PyUnicode_AsDecodedObject() is deprecated; "
3659 "use PyCodec_Decode() to decode from str", 1) < 0)
3660 return NULL;
3661
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003662 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003663 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003664
3665 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003666 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003667}
3668
Alexander Belopolsky40018472011-02-26 01:02:56 +00003669PyObject *
3670PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003671 const char *encoding,
3672 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003673{
3674 PyObject *v;
3675
3676 if (!PyUnicode_Check(unicode)) {
3677 PyErr_BadArgument();
3678 goto onError;
3679 }
3680
Serhiy Storchaka00939072016-10-27 21:05:49 +03003681 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3682 "PyUnicode_AsDecodedUnicode() is deprecated; "
3683 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3684 return NULL;
3685
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003686 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003688
3689 /* Decode via the codec registry */
3690 v = PyCodec_Decode(unicode, encoding, errors);
3691 if (v == NULL)
3692 goto onError;
3693 if (!PyUnicode_Check(v)) {
3694 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003695 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003696 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003697 encoding,
3698 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003699 Py_DECREF(v);
3700 goto onError;
3701 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003702 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003703
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003705 return NULL;
3706}
3707
Alexander Belopolsky40018472011-02-26 01:02:56 +00003708PyObject *
3709PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003710 Py_ssize_t size,
3711 const char *encoding,
3712 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713{
3714 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003715
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003716 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003718 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3720 Py_DECREF(unicode);
3721 return v;
3722}
3723
Alexander Belopolsky40018472011-02-26 01:02:56 +00003724PyObject *
3725PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003726 const char *encoding,
3727 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003728{
3729 PyObject *v;
3730
3731 if (!PyUnicode_Check(unicode)) {
3732 PyErr_BadArgument();
3733 goto onError;
3734 }
3735
Serhiy Storchaka00939072016-10-27 21:05:49 +03003736 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3737 "PyUnicode_AsEncodedObject() is deprecated; "
3738 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3739 "or PyCodec_Encode() for generic encoding", 1) < 0)
3740 return NULL;
3741
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003742 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003743 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003744
3745 /* Encode via the codec registry */
3746 v = PyCodec_Encode(unicode, encoding, errors);
3747 if (v == NULL)
3748 goto onError;
3749 return v;
3750
Benjamin Peterson29060642009-01-31 22:14:21 +00003751 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003752 return NULL;
3753}
3754
Victor Stinner1b579672011-12-17 05:47:23 +01003755
Victor Stinner2cba6b82018-01-10 22:46:15 +01003756static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003757unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003758 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003759{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003760 Py_ssize_t wlen;
3761 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3762 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003763 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003764 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003765
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003766 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003767 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003768 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003769 return NULL;
3770 }
3771
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003772 char *str;
3773 size_t error_pos;
3774 const char *reason;
3775 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003776 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003777 PyMem_Free(wstr);
3778
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003779 if (res != 0) {
3780 if (res == -2) {
3781 PyObject *exc;
3782 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3783 "locale", unicode,
3784 (Py_ssize_t)error_pos,
3785 (Py_ssize_t)(error_pos+1),
3786 reason);
3787 if (exc != NULL) {
3788 PyCodec_StrictErrors(exc);
3789 Py_DECREF(exc);
3790 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003791 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003792 else if (res == -3) {
3793 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3794 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003795 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003796 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003797 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003798 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003799 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003800
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003801 PyObject *bytes = PyBytes_FromString(str);
3802 PyMem_RawFree(str);
3803 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003804}
3805
Victor Stinnerad158722010-10-27 00:25:46 +00003806PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003807PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3808{
Victor Stinner709d23d2019-05-02 14:56:30 -04003809 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3810 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003811}
3812
3813PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003814PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003815{
Victor Stinner81a7be32020-04-14 15:14:01 +02003816 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003817 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3818 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003819 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003820 fs_codec->error_handler,
3821 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003822 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003823#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003824 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003825 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003826 fs_codec->encoding,
3827 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003828 }
Victor Stinnerad158722010-10-27 00:25:46 +00003829#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003830 else {
3831 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3832 machinery is not ready and so cannot be used:
3833 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003834 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3835 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003836 assert(filesystem_errors != NULL);
3837 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3838 assert(errors != _Py_ERROR_UNKNOWN);
3839#ifdef _Py_FORCE_UTF8_FS_ENCODING
3840 return unicode_encode_utf8(unicode, errors, NULL);
3841#else
3842 return unicode_encode_locale(unicode, errors, 0);
3843#endif
3844 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003845}
3846
Alexander Belopolsky40018472011-02-26 01:02:56 +00003847PyObject *
3848PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003849 const char *encoding,
3850 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851{
3852 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003853 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003854
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 if (!PyUnicode_Check(unicode)) {
3856 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 }
Fred Drakee4315f52000-05-09 19:53:39 +00003859
Victor Stinner22eb6892019-06-26 00:51:05 +02003860 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3861 return NULL;
3862 }
3863
Victor Stinner942889a2016-09-05 15:40:10 -07003864 if (encoding == NULL) {
3865 return _PyUnicode_AsUTF8String(unicode, errors);
3866 }
3867
Fred Drakee4315f52000-05-09 19:53:39 +00003868 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003869 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3870 char *lower = buflower;
3871
3872 /* Fast paths */
3873 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3874 lower += 3;
3875 if (*lower == '_') {
3876 /* Match "utf8" and "utf_8" */
3877 lower++;
3878 }
3879
3880 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003882 }
3883 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3884 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3885 }
3886 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3887 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3888 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003889 }
Victor Stinner942889a2016-09-05 15:40:10 -07003890 else {
3891 if (strcmp(lower, "ascii") == 0
3892 || strcmp(lower, "us_ascii") == 0) {
3893 return _PyUnicode_AsASCIIString(unicode, errors);
3894 }
Steve Dowercc16be82016-09-08 10:35:16 -07003895#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003896 else if (strcmp(lower, "mbcs") == 0) {
3897 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3898 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003899#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003900 else if (strcmp(lower, "latin1") == 0 ||
3901 strcmp(lower, "latin_1") == 0 ||
3902 strcmp(lower, "iso_8859_1") == 0 ||
3903 strcmp(lower, "iso8859_1") == 0) {
3904 return _PyUnicode_AsLatin1String(unicode, errors);
3905 }
3906 }
Victor Stinner37296e82010-06-10 13:36:23 +00003907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908
3909 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003910 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003912 return NULL;
3913
3914 /* The normal path */
3915 if (PyBytes_Check(v))
3916 return v;
3917
3918 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003919 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003920 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003921 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003922
3923 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003924 "encoder %s returned bytearray instead of bytes; "
3925 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003926 encoding);
3927 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003928 Py_DECREF(v);
3929 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003930 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003931
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003932 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3933 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003934 Py_DECREF(v);
3935 return b;
3936 }
3937
3938 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003939 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003940 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003941 encoding,
3942 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003943 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003944 return NULL;
3945}
3946
Alexander Belopolsky40018472011-02-26 01:02:56 +00003947PyObject *
3948PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003949 const char *encoding,
3950 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003951{
3952 PyObject *v;
3953
3954 if (!PyUnicode_Check(unicode)) {
3955 PyErr_BadArgument();
3956 goto onError;
3957 }
3958
Serhiy Storchaka00939072016-10-27 21:05:49 +03003959 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3960 "PyUnicode_AsEncodedUnicode() is deprecated; "
3961 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3962 return NULL;
3963
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003964 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003966
3967 /* Encode via the codec registry */
3968 v = PyCodec_Encode(unicode, encoding, errors);
3969 if (v == NULL)
3970 goto onError;
3971 if (!PyUnicode_Check(v)) {
3972 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003973 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003974 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003975 encoding,
3976 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003977 Py_DECREF(v);
3978 goto onError;
3979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003981
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 return NULL;
3984}
3985
Victor Stinner2cba6b82018-01-10 22:46:15 +01003986static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003987unicode_decode_locale(const char *str, Py_ssize_t len,
3988 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003989{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003990 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3991 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003992 return NULL;
3993 }
3994
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003995 wchar_t *wstr;
3996 size_t wlen;
3997 const char *reason;
3998 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003999 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004000 if (res != 0) {
4001 if (res == -2) {
4002 PyObject *exc;
4003 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4004 "locale", str, len,
4005 (Py_ssize_t)wlen,
4006 (Py_ssize_t)(wlen + 1),
4007 reason);
4008 if (exc != NULL) {
4009 PyCodec_StrictErrors(exc);
4010 Py_DECREF(exc);
4011 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004012 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02004013 else if (res == -3) {
4014 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4015 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004016 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004017 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01004018 }
Victor Stinner2f197072011-12-17 07:08:30 +01004019 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01004020 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004021
4022 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4023 PyMem_RawFree(wstr);
4024 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004025}
4026
4027PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01004028PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4029 const char *errors)
4030{
Victor Stinner709d23d2019-05-02 14:56:30 -04004031 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4032 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01004033}
4034
4035PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01004036PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004037{
4038 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04004039 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4040 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004041}
4042
4043
4044PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00004045PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004046 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004047 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4048}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004049
Christian Heimes5894ba72007-11-04 11:43:14 +00004050PyObject*
4051PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4052{
Victor Stinner81a7be32020-04-14 15:14:01 +02004053 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004054 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4055 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004056 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004057 fs_codec->error_handler,
4058 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004059 NULL);
4060 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004061#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004062 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004063 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004064 fs_codec->encoding,
4065 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004066 }
Victor Stinnerad158722010-10-27 00:25:46 +00004067#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004068 else {
4069 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4070 machinery is not ready and so cannot be used:
4071 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004072 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4073 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004074 assert(filesystem_errors != NULL);
4075 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4076 assert(errors != _Py_ERROR_UNKNOWN);
4077#ifdef _Py_FORCE_UTF8_FS_ENCODING
4078 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4079#else
4080 return unicode_decode_locale(s, size, errors, 0);
4081#endif
4082 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004083}
4084
Martin v. Löwis011e8422009-05-05 04:43:17 +00004085
4086int
4087PyUnicode_FSConverter(PyObject* arg, void* addr)
4088{
Brett Cannonec6ce872016-09-06 15:50:29 -07004089 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004090 PyObject *output = NULL;
4091 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004092 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004093 if (arg == NULL) {
4094 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004095 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004096 return 1;
4097 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004098 path = PyOS_FSPath(arg);
4099 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004100 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004101 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004102 if (PyBytes_Check(path)) {
4103 output = path;
4104 }
4105 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4106 output = PyUnicode_EncodeFSDefault(path);
4107 Py_DECREF(path);
4108 if (!output) {
4109 return 0;
4110 }
4111 assert(PyBytes_Check(output));
4112 }
4113
Victor Stinner0ea2a462010-04-30 00:22:08 +00004114 size = PyBytes_GET_SIZE(output);
4115 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004116 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004117 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004118 Py_DECREF(output);
4119 return 0;
4120 }
4121 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004122 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004123}
4124
4125
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004126int
4127PyUnicode_FSDecoder(PyObject* arg, void* addr)
4128{
Brett Cannona5711202016-09-06 19:36:01 -07004129 int is_buffer = 0;
4130 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004131 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004132 if (arg == NULL) {
4133 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004134 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004135 return 1;
4136 }
Brett Cannona5711202016-09-06 19:36:01 -07004137
4138 is_buffer = PyObject_CheckBuffer(arg);
4139 if (!is_buffer) {
4140 path = PyOS_FSPath(arg);
4141 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004142 return 0;
4143 }
Brett Cannona5711202016-09-06 19:36:01 -07004144 }
4145 else {
4146 path = arg;
4147 Py_INCREF(arg);
4148 }
4149
4150 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004151 output = path;
4152 }
4153 else if (PyBytes_Check(path) || is_buffer) {
4154 PyObject *path_bytes = NULL;
4155
4156 if (!PyBytes_Check(path) &&
4157 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004158 "path should be string, bytes, or os.PathLike, not %.200s",
4159 Py_TYPE(arg)->tp_name)) {
4160 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004161 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004162 }
4163 path_bytes = PyBytes_FromObject(path);
4164 Py_DECREF(path);
4165 if (!path_bytes) {
4166 return 0;
4167 }
4168 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4169 PyBytes_GET_SIZE(path_bytes));
4170 Py_DECREF(path_bytes);
4171 if (!output) {
4172 return 0;
4173 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004174 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004175 else {
4176 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004177 "path should be string, bytes, or os.PathLike, not %.200s",
4178 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004179 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004180 return 0;
4181 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004182 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004183 Py_DECREF(output);
4184 return 0;
4185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004186 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004187 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004188 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004189 Py_DECREF(output);
4190 return 0;
4191 }
4192 *(PyObject**)addr = output;
4193 return Py_CLEANUP_SUPPORTED;
4194}
4195
4196
Inada Naoki02a4d572020-02-27 13:48:59 +09004197static int unicode_fill_utf8(PyObject *unicode);
4198
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004199const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004200PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004201{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004202 if (!PyUnicode_Check(unicode)) {
4203 PyErr_BadArgument();
4204 return NULL;
4205 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004206 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004207 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004208
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004209 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004210 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004211 return NULL;
4212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004213 }
4214
4215 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004216 *psize = PyUnicode_UTF8_LENGTH(unicode);
4217 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004218}
4219
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004220const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004221PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004222{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004223 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4224}
4225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004226Py_UNICODE *
4227PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4228{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004229 if (!PyUnicode_Check(unicode)) {
4230 PyErr_BadArgument();
4231 return NULL;
4232 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004233 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4234 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004235 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004236 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004237 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238
Serhiy Storchakac46db922018-10-23 22:58:24 +03004239 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4240 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4241 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004243 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01004244 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
Serhiy Storchakac46db922018-10-23 22:58:24 +03004245 if (w == NULL) {
4246 PyErr_NoMemory();
4247 return NULL;
4248 }
4249 unicode_copy_as_widechar(unicode, w, wlen + 1);
4250 _PyUnicode_WSTR(unicode) = w;
4251 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4252 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004253 }
4254 }
4255 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004256 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004257 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004258}
4259
Inada Naoki2c4928d2020-06-17 20:09:44 +09004260/* Deprecated APIs */
4261
4262_Py_COMP_DIAG_PUSH
4263_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4264
Alexander Belopolsky40018472011-02-26 01:02:56 +00004265Py_UNICODE *
4266PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269}
4270
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004271const Py_UNICODE *
4272_PyUnicode_AsUnicode(PyObject *unicode)
4273{
4274 Py_ssize_t size;
4275 const Py_UNICODE *wstr;
4276
4277 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4278 if (wstr && wcslen(wstr) != (size_t)size) {
4279 PyErr_SetString(PyExc_ValueError, "embedded null character");
4280 return NULL;
4281 }
4282 return wstr;
4283}
4284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004285
Alexander Belopolsky40018472011-02-26 01:02:56 +00004286Py_ssize_t
4287PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288{
4289 if (!PyUnicode_Check(unicode)) {
4290 PyErr_BadArgument();
4291 goto onError;
4292 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004293 if (_PyUnicode_WSTR(unicode) == NULL) {
4294 if (PyUnicode_AsUnicode(unicode) == NULL)
4295 goto onError;
4296 }
4297 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 return -1;
4301}
4302
Inada Naoki2c4928d2020-06-17 20:09:44 +09004303_Py_COMP_DIAG_POP
4304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004305Py_ssize_t
4306PyUnicode_GetLength(PyObject *unicode)
4307{
Victor Stinner07621332012-06-16 04:53:46 +02004308 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004309 PyErr_BadArgument();
4310 return -1;
4311 }
Victor Stinner07621332012-06-16 04:53:46 +02004312 if (PyUnicode_READY(unicode) == -1)
4313 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004314 return PyUnicode_GET_LENGTH(unicode);
4315}
4316
4317Py_UCS4
4318PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4319{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004320 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004321 int kind;
4322
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004323 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004324 PyErr_BadArgument();
4325 return (Py_UCS4)-1;
4326 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004327 if (PyUnicode_READY(unicode) == -1) {
4328 return (Py_UCS4)-1;
4329 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004330 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004331 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004332 return (Py_UCS4)-1;
4333 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004334 data = PyUnicode_DATA(unicode);
4335 kind = PyUnicode_KIND(unicode);
4336 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004337}
4338
4339int
4340PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4341{
4342 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004343 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344 return -1;
4345 }
Victor Stinner488fa492011-12-12 00:01:39 +01004346 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004347 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004348 PyErr_SetString(PyExc_IndexError, "string index out of range");
4349 return -1;
4350 }
Victor Stinner488fa492011-12-12 00:01:39 +01004351 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004352 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004353 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4354 PyErr_SetString(PyExc_ValueError, "character out of range");
4355 return -1;
4356 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004357 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4358 index, ch);
4359 return 0;
4360}
4361
/* Return the name of the default encoding.  The value is the fixed string
   "utf-8", held in static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4367
Victor Stinner554f3f02010-06-16 23:33:54 +00004368/* create or adjust a UnicodeDecodeError */
4369static void
4370make_decode_exception(PyObject **exceptionObject,
4371 const char *encoding,
4372 const char *input, Py_ssize_t length,
4373 Py_ssize_t startpos, Py_ssize_t endpos,
4374 const char *reason)
4375{
4376 if (*exceptionObject == NULL) {
4377 *exceptionObject = PyUnicodeDecodeError_Create(
4378 encoding, input, length, startpos, endpos, reason);
4379 }
4380 else {
4381 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4382 goto onError;
4383 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4384 goto onError;
4385 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4386 goto onError;
4387 }
4388 return;
4389
4390onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004391 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004392}
4393
Steve Dowercc16be82016-09-08 10:35:16 -07004394#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004395static int
4396widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4397{
4398 if (newsize > *size) {
4399 wchar_t *newbuf = *buf;
4400 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4401 PyErr_NoMemory();
4402 return -1;
4403 }
4404 *buf = newbuf;
4405 }
4406 *size = newsize;
4407 return 0;
4408}
4409
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410/* error handling callback helper:
4411 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004412 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 and adjust various state variables.
4414 return 0 on success, -1 on error
4415*/
4416
Alexander Belopolsky40018472011-02-26 01:02:56 +00004417static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004418unicode_decode_call_errorhandler_wchar(
4419 const char *errors, PyObject **errorHandler,
4420 const char *encoding, const char *reason,
4421 const char **input, const char **inend, Py_ssize_t *startinpos,
4422 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004423 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004425 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426
4427 PyObject *restuple = NULL;
4428 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004429 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004430 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004431 Py_ssize_t requiredsize;
4432 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004433 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004434 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435
4436 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 *errorHandler = PyCodec_LookupError(errors);
4438 if (*errorHandler == NULL)
4439 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 }
4441
Victor Stinner554f3f02010-06-16 23:33:54 +00004442 make_decode_exception(exceptionObject,
4443 encoding,
4444 *input, *inend - *input,
4445 *startinpos, *endinpos,
4446 reason);
4447 if (*exceptionObject == NULL)
4448 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449
Petr Viktorinffd97532020-02-11 17:46:57 +01004450 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004454 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004455 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004457 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004459
4460 /* Copy back the bytes variables, which might have been modified by the
4461 callback */
4462 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4463 if (!inputobj)
4464 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004465 *input = PyBytes_AS_STRING(inputobj);
4466 insize = PyBytes_GET_SIZE(inputobj);
4467 *inend = *input + insize;
4468 /* we can DECREF safely, as the exception has another reference,
4469 so the object won't go away. */
4470 Py_DECREF(inputobj);
4471
4472 if (newpos<0)
4473 newpos = insize+newpos;
4474 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004475 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004476 goto onError;
4477 }
4478
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004479#if USE_UNICODE_WCHAR_CACHE
4480_Py_COMP_DIAG_PUSH
4481_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4482 repwlen = PyUnicode_GetSize(repunicode);
4483 if (repwlen < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004484 goto onError;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004485_Py_COMP_DIAG_POP
4486#else /* USE_UNICODE_WCHAR_CACHE */
4487 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4488 if (repwlen < 0)
4489 goto onError;
4490 repwlen--;
4491#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004492 /* need more space? (at least enough for what we
4493 have+the replacement+the rest of the string (starting
4494 at the new input position), so we won't have to check space
4495 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004496 requiredsize = *outpos;
4497 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4498 goto overflow;
4499 requiredsize += repwlen;
4500 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4501 goto overflow;
4502 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004503 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004504 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004505 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004506 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004507 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004508 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004509 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004510 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004511 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004512 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004513 *endinpos = newpos;
4514 *inptr = *input + newpos;
4515
4516 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004517 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004518 return 0;
4519
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004520 overflow:
4521 PyErr_SetString(PyExc_OverflowError,
4522 "decoded result is too long for a Python string");
4523
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004524 onError:
4525 Py_XDECREF(restuple);
4526 return -1;
4527}
Steve Dowercc16be82016-09-08 10:35:16 -07004528#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529
4530static int
4531unicode_decode_call_errorhandler_writer(
4532 const char *errors, PyObject **errorHandler,
4533 const char *encoding, const char *reason,
4534 const char **input, const char **inend, Py_ssize_t *startinpos,
4535 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4536 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4537{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004538 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004539
4540 PyObject *restuple = NULL;
4541 PyObject *repunicode = NULL;
4542 Py_ssize_t insize;
4543 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004544 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004545 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004546 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004547 int need_to_grow = 0;
4548 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004549
4550 if (*errorHandler == NULL) {
4551 *errorHandler = PyCodec_LookupError(errors);
4552 if (*errorHandler == NULL)
4553 goto onError;
4554 }
4555
4556 make_decode_exception(exceptionObject,
4557 encoding,
4558 *input, *inend - *input,
4559 *startinpos, *endinpos,
4560 reason);
4561 if (*exceptionObject == NULL)
4562 goto onError;
4563
Petr Viktorinffd97532020-02-11 17:46:57 +01004564 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004565 if (restuple == NULL)
4566 goto onError;
4567 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004568 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004569 goto onError;
4570 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004571 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004572 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004573
4574 /* Copy back the bytes variables, which might have been modified by the
4575 callback */
4576 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4577 if (!inputobj)
4578 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004579 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004580 *input = PyBytes_AS_STRING(inputobj);
4581 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004582 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004583 /* we can DECREF safely, as the exception has another reference,
4584 so the object won't go away. */
4585 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004589 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004590 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004592 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593
Victor Stinner170ca6f2013-04-18 00:25:28 +02004594 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004595 if (replen > 1) {
4596 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004597 need_to_grow = 1;
4598 }
4599 new_inptr = *input + newpos;
4600 if (*inend - new_inptr > remain) {
4601 /* We don't know the decoding algorithm here so we make the worst
4602 assumption that one byte decodes to one unicode character.
4603 If unfortunately one byte could decode to more unicode characters,
4604 the decoder may write out-of-bound then. Is it possible for the
4605 algorithms using this function? */
4606 writer->min_length += *inend - new_inptr - remain;
4607 need_to_grow = 1;
4608 }
4609 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004610 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004611 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004612 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4613 goto onError;
4614 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004615 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004616 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004617
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004619 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004622 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004623 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004627 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628}
4629
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630/* --- UTF-7 Codec -------------------------------------------------------- */
4631
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632/* See RFC2152 for details. We encode conservatively and decode liberally. */
4633
/* Base-64 helper macros.  Each one may evaluate its argument more than
   once, so only pass expressions without side effects. */

/* Non-zero if c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds:
   any character that is not a letter, digit or '+' maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? 26 + ((c) - 'a') :    \
     ((c) >= '0' && (c) <= '9') ? 52 + ((c) - '0') :    \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero if byte c, met in a UTF-7 string outside a
 * base-64 section, decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4664
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        : character code; anything outside 1..127 is never direct
 *   directO  : non-zero to pass Set O characters through unencoded
 *   directWS : non-zero to pass whitespace through unencoded
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions such as `a || b` keep their meaning.  c is evaluated more
 * than once, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004710
Alexander Belopolsky40018472011-02-26 01:02:56 +00004711PyObject *
4712PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004713 Py_ssize_t size,
4714 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004715{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004716 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4717}
4718
Antoine Pitrou244651a2009-05-04 18:56:13 +00004719/* The decoder. The only state we preserve is our read position,
4720 * i.e. how many characters we have consumed. So if we end in the
4721 * middle of a shift sequence we have to back off the read position
4722 * and the output to the beginning of the sequence, otherwise we lose
4723 * all the shift state (seen bits, number of bits seen, high
4724 * surrogate). */
4725
Alexander Belopolsky40018472011-02-26 01:02:56 +00004726PyObject *
4727PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004728 Py_ssize_t size,
4729 const char *errors,
4730 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004731{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004733 Py_ssize_t startinpos;
4734 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004736 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004737 const char *errmsg = "";
4738 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004739 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004740 unsigned int base64bits = 0;
4741 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004742 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 PyObject *errorHandler = NULL;
4744 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004746 if (size == 0) {
4747 if (consumed)
4748 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004749 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004750 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004752 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004753 _PyUnicodeWriter_Init(&writer);
4754 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755
4756 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004757 e = s + size;
4758
4759 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004760 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004762 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004763
Antoine Pitrou244651a2009-05-04 18:56:13 +00004764 if (inShift) { /* in a base-64 section */
4765 if (IS_BASE64(ch)) { /* consume a base-64 character */
4766 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4767 base64bits += 6;
4768 s++;
4769 if (base64bits >= 16) {
4770 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004771 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 base64bits -= 16;
4773 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004774 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 if (surrogate) {
4776 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004777 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4778 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004779 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004780 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004782 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004783 }
4784 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004785 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004786 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004787 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 }
4789 }
Victor Stinner551ac952011-11-29 22:58:13 +01004790 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 /* first surrogate */
4792 surrogate = outCh;
4793 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004795 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004796 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 }
4798 }
4799 }
4800 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004801 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004802 if (base64bits > 0) { /* left-over bits */
4803 if (base64bits >= 6) {
4804 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004805 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004806 errmsg = "partial character in shift sequence";
4807 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004808 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004809 else {
4810 /* Some bits remain; they should be zero */
4811 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004812 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 errmsg = "non-zero padding bits in shift sequence";
4814 goto utf7Error;
4815 }
4816 }
4817 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004818 if (surrogate && DECODE_DIRECT(ch)) {
4819 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4820 goto onError;
4821 }
4822 surrogate = 0;
4823 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004824 /* '-' is absorbed; other terminating
4825 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004826 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004827 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004828 }
4829 }
4830 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004832 s++; /* consume '+' */
4833 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004834 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004835 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004836 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004837 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004838 else if (s < e && !IS_BASE64(*s)) {
4839 s++;
4840 errmsg = "ill-formed sequence";
4841 goto utf7Error;
4842 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004843 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004844 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004845 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004846 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004847 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004848 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004849 }
4850 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004851 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004852 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004853 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004854 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004855 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004856 else {
4857 startinpos = s-starts;
4858 s++;
4859 errmsg = "unexpected special character";
4860 goto utf7Error;
4861 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004862 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004863utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004865 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 errors, &errorHandler,
4867 "utf7", errmsg,
4868 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004869 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004871 }
4872
Antoine Pitrou244651a2009-05-04 18:56:13 +00004873 /* end of string */
4874
4875 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4876 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004877 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004878 if (surrogate ||
4879 (base64bits >= 6) ||
4880 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004881 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004882 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004883 errors, &errorHandler,
4884 "utf7", "unterminated shift sequence",
4885 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004886 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004887 goto onError;
4888 if (s < e)
4889 goto restart;
4890 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004891 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004892
4893 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004894 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004895 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004896 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004897 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004898 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004899 writer.kind, writer.data, shiftOutStart);
4900 Py_XDECREF(errorHandler);
4901 Py_XDECREF(exc);
4902 _PyUnicodeWriter_Dealloc(&writer);
4903 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004904 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004905 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004906 }
4907 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004908 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004909 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004910 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004911
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004912 Py_XDECREF(errorHandler);
4913 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004914 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004915
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004917 Py_XDECREF(errorHandler);
4918 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004919 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004920 return NULL;
4921}
4922
4923
Alexander Belopolsky40018472011-02-26 01:02:56 +00004924PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004925_PyUnicode_EncodeUTF7(PyObject *str,
4926 int base64SetO,
4927 int base64WhiteSpace,
4928 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004929{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004930 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004931 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004932 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004933 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004934 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004935 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004936 unsigned int base64bits = 0;
4937 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004938 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004939 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004940
Benjamin Petersonbac79492012-01-14 13:34:47 -05004941 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004942 return NULL;
4943 kind = PyUnicode_KIND(str);
4944 data = PyUnicode_DATA(str);
4945 len = PyUnicode_GET_LENGTH(str);
4946
4947 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004949
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004950 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004951 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004952 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004953 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004954 if (v == NULL)
4955 return NULL;
4956
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004957 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004958 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004959 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004960
Antoine Pitrou244651a2009-05-04 18:56:13 +00004961 if (inShift) {
4962 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4963 /* shifting out */
4964 if (base64bits) { /* output remaining bits */
4965 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4966 base64buffer = 0;
4967 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004968 }
4969 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004970 /* Characters not in the BASE64 set implicitly unshift the sequence
4971 so no '-' is required, except if the character is itself a '-' */
4972 if (IS_BASE64(ch) || ch == '-') {
4973 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004974 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004975 *out++ = (char) ch;
4976 }
4977 else {
4978 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004979 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004980 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004981 else { /* not in a shift sequence */
4982 if (ch == '+') {
4983 *out++ = '+';
4984 *out++ = '-';
4985 }
4986 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4987 *out++ = (char) ch;
4988 }
4989 else {
4990 *out++ = '+';
4991 inShift = 1;
4992 goto encode_char;
4993 }
4994 }
4995 continue;
4996encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004997 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004998 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004999
Antoine Pitrou244651a2009-05-04 18:56:13 +00005000 /* code first surrogate */
5001 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01005002 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00005003 while (base64bits >= 6) {
5004 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
5005 base64bits -= 6;
5006 }
5007 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01005008 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00005009 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005010 base64bits += 16;
5011 base64buffer = (base64buffer << 16) | ch;
5012 while (base64bits >= 6) {
5013 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
5014 base64bits -= 6;
5015 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00005016 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005017 if (base64bits)
5018 *out++= TO_BASE64(base64buffer << (6-base64bits) );
5019 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005020 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005021 if (_PyBytes_Resize(&v, out - start) < 0)
5022 return NULL;
5023 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005024}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005025PyObject *
5026PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5027 Py_ssize_t size,
5028 int base64SetO,
5029 int base64WhiteSpace,
5030 const char *errors)
5031{
5032 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005033 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005034 if (tmp == NULL)
5035 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01005036 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005037 base64WhiteSpace, errors);
5038 Py_DECREF(tmp);
5039 return result;
5040}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005041
/* The UTF-7 helper macros are not used past this point. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005047
/* --- UTF-8 Codec -------------------------------------------------------- */
5049
Alexander Belopolsky40018472011-02-26 01:02:56 +00005050PyObject *
5051PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005052 Py_ssize_t size,
5053 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054{
Walter Dörwald69652032004-09-07 20:24:22 +00005055 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5056}
5057
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005058#include "stringlib/asciilib.h"
5059#include "stringlib/codecs.h"
5060#include "stringlib/undef.h"
5061
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005062#include "stringlib/ucs1lib.h"
5063#include "stringlib/codecs.h"
5064#include "stringlib/undef.h"
5065
5066#include "stringlib/ucs2lib.h"
5067#include "stringlib/codecs.h"
5068#include "stringlib/undef.h"
5069
5070#include "stringlib/ucs4lib.h"
5071#include "stringlib/codecs.h"
5072#include "stringlib/undef.h"
5073
Ma Lina0c603c2020-10-18 22:48:38 +08005074/* Mask to quickly check whether a C 'size_t' contains a
Antoine Pitrouab868312009-01-10 15:40:25 +00005075 non-ASCII, UTF8-encoded char. */
Ma Lina0c603c2020-10-18 22:48:38 +08005076#if (SIZEOF_SIZE_T == 8)
5077# define ASCII_CHAR_MASK 0x8080808080808080ULL
5078#elif (SIZEOF_SIZE_T == 4)
5079# define ASCII_CHAR_MASK 0x80808080U
Antoine Pitrouab868312009-01-10 15:40:25 +00005080#else
Ma Lina0c603c2020-10-18 22:48:38 +08005081# error C 'size_t' size should be either 4 or 8!
Antoine Pitrouab868312009-01-10 15:40:25 +00005082#endif
5083
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084static Py_ssize_t
5085ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005086{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087 const char *p = start;
Ma Lina0c603c2020-10-18 22:48:38 +08005088 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_SIZE_T);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005089
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005090 /*
5091 * Issue #17237: m68k is a bit different from most architectures in
5092 * that objects do not use "natural alignment" - for example, int and
5093 * long are only aligned at 2-byte boundaries. Therefore the assert()
5094 * won't work; also, tests have shown that skipping the "optimised
5095 * version" will even speed up m68k.
5096 */
5097#if !defined(__m68k__)
Ma Lina0c603c2020-10-18 22:48:38 +08005098#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5099 assert(_Py_IS_ALIGNED(dest, SIZEOF_SIZE_T));
5100 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005101 /* Fast path, see in STRINGLIB(utf8_decode) for
5102 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005103 /* Help allocation */
5104 const char *_p = p;
5105 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005106 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005107 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005108 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005110 *((size_t *)q) = value;
5111 _p += SIZEOF_SIZE_T;
5112 q += SIZEOF_SIZE_T;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005113 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005114 p = _p;
5115 while (p < end) {
5116 if ((unsigned char)*p & 0x80)
5117 break;
5118 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005120 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005122#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005123#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 while (p < end) {
5125 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5126 for an explanation. */
Ma Lina0c603c2020-10-18 22:48:38 +08005127 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005128 /* Help allocation */
5129 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005131 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005132 if (value & ASCII_CHAR_MASK)
5133 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005134 _p += SIZEOF_SIZE_T;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005135 }
5136 p = _p;
5137 if (_p == end)
5138 break;
5139 }
5140 if ((unsigned char)*p & 0x80)
5141 break;
5142 ++p;
5143 }
5144 memcpy(dest, start, p - start);
5145 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146}
Antoine Pitrouab868312009-01-10 15:40:25 +00005147
Victor Stinner709d23d2019-05-02 14:56:30 -04005148static PyObject *
5149unicode_decode_utf8(const char *s, Py_ssize_t size,
5150 _Py_error_handler error_handler, const char *errors,
5151 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005152{
Victor Stinner785938e2011-12-11 20:09:03 +01005153 if (size == 0) {
5154 if (consumed)
5155 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005156 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005157 }
5158
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005159 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5160 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005161 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005162 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005163 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005164 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005165 }
5166
Inada Naoki770847a2019-06-24 12:30:24 +09005167 const char *starts = s;
5168 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005169
Inada Naoki770847a2019-06-24 12:30:24 +09005170 // fast path: try ASCII string.
5171 PyObject *u = PyUnicode_New(size, 127);
5172 if (u == NULL) {
5173 return NULL;
5174 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005175 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005176 if (s == end) {
5177 return u;
5178 }
5179
5180 // Use _PyUnicodeWriter after fast path is failed.
5181 _PyUnicodeWriter writer;
5182 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5183 writer.pos = s - starts;
5184
5185 Py_ssize_t startinpos, endinpos;
5186 const char *errmsg = "";
5187 PyObject *error_handler_obj = NULL;
5188 PyObject *exc = NULL;
5189
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005190 while (s < end) {
5191 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005192 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005193
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005194 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005195 if (PyUnicode_IS_ASCII(writer.buffer))
5196 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005197 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005198 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005199 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005200 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005201 } else {
5202 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005203 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005204 }
5205
5206 switch (ch) {
5207 case 0:
5208 if (s == end || consumed)
5209 goto End;
5210 errmsg = "unexpected end of data";
5211 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005212 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005213 break;
5214 case 1:
5215 errmsg = "invalid start byte";
5216 startinpos = s - starts;
5217 endinpos = startinpos + 1;
5218 break;
5219 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005220 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5221 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5222 {
5223 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005224 goto End;
5225 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005226 /* fall through */
5227 case 3:
5228 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005229 errmsg = "invalid continuation byte";
5230 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005231 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005232 break;
5233 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005234 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005235 goto onError;
5236 continue;
5237 }
5238
Victor Stinner1d65d912015-10-05 13:43:50 +02005239 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005240 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005241
5242 switch (error_handler) {
5243 case _Py_ERROR_IGNORE:
5244 s += (endinpos - startinpos);
5245 break;
5246
5247 case _Py_ERROR_REPLACE:
5248 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5249 goto onError;
5250 s += (endinpos - startinpos);
5251 break;
5252
5253 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005254 {
5255 Py_ssize_t i;
5256
Victor Stinner1d65d912015-10-05 13:43:50 +02005257 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5258 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005259 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005260 ch = (Py_UCS4)(unsigned char)(starts[i]);
5261 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5262 ch + 0xdc00);
5263 writer.pos++;
5264 }
5265 s += (endinpos - startinpos);
5266 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005267 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005268
5269 default:
5270 if (unicode_decode_call_errorhandler_writer(
5271 errors, &error_handler_obj,
5272 "utf-8", errmsg,
5273 &starts, &end, &startinpos, &endinpos, &exc, &s,
5274 &writer))
5275 goto onError;
5276 }
Victor Stinner785938e2011-12-11 20:09:03 +01005277 }
5278
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005279End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005280 if (consumed)
5281 *consumed = s - starts;
5282
Victor Stinner1d65d912015-10-05 13:43:50 +02005283 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005284 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005285 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005286
5287onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005288 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005289 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005290 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005291 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005292}
5293
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005294
Victor Stinner709d23d2019-05-02 14:56:30 -04005295PyObject *
5296PyUnicode_DecodeUTF8Stateful(const char *s,
5297 Py_ssize_t size,
5298 const char *errors,
5299 Py_ssize_t *consumed)
5300{
5301 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5302}
5303
5304
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005305/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5306 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005307
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005308 On success, write a pointer to a newly allocated wide character string into
5309 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5310 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005311
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005312 On memory allocation failure, return -1.
5313
5314 On decoding error (if surrogateescape is zero), return -2. If wlen is
5315 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5316 is not NULL, write the decoding error message into *reason. */
5317int
5318_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005319 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005320{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005321 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005322 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005323 wchar_t *unicode;
5324 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005325
Victor Stinner3d4226a2018-08-29 22:21:32 +02005326 int surrogateescape = 0;
5327 int surrogatepass = 0;
5328 switch (errors)
5329 {
5330 case _Py_ERROR_STRICT:
5331 break;
5332 case _Py_ERROR_SURROGATEESCAPE:
5333 surrogateescape = 1;
5334 break;
5335 case _Py_ERROR_SURROGATEPASS:
5336 surrogatepass = 1;
5337 break;
5338 default:
5339 return -3;
5340 }
5341
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005342 /* Note: size will always be longer than the resulting Unicode
5343 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005344 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005345 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005346 }
5347
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005348 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005349 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005350 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005351 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005352
5353 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005354 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005355 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005356 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005357 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005358#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005359 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005360#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005361 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005362#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005363 if (ch > 0xFF) {
5364#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005365 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005366#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005367 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005368 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005369 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5370 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5371#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005372 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005373 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005374 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005375 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005376 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005377
5378 if (surrogateescape) {
5379 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5380 }
5381 else {
5382 /* Is it a valid three-byte code? */
5383 if (surrogatepass
5384 && (e - s) >= 3
5385 && (s[0] & 0xf0) == 0xe0
5386 && (s[1] & 0xc0) == 0x80
5387 && (s[2] & 0xc0) == 0x80)
5388 {
5389 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5390 s += 3;
5391 unicode[outpos++] = ch;
5392 }
5393 else {
5394 PyMem_RawFree(unicode );
5395 if (reason != NULL) {
5396 switch (ch) {
5397 case 0:
5398 *reason = "unexpected end of data";
5399 break;
5400 case 1:
5401 *reason = "invalid start byte";
5402 break;
5403 /* 2, 3, 4 */
5404 default:
5405 *reason = "invalid continuation byte";
5406 break;
5407 }
5408 }
5409 if (wlen != NULL) {
5410 *wlen = s - orig_s;
5411 }
5412 return -2;
5413 }
5414 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005415 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005416 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005417 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005418 if (wlen) {
5419 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005420 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005421 *wstr = unicode;
5422 return 0;
5423}
5424
Victor Stinner5f9cf232019-03-19 01:46:25 +01005425
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005426wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005427_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5428 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005429{
5430 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005431 int res = _Py_DecodeUTF8Ex(arg, arglen,
5432 &wstr, wlen,
5433 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005434 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005435 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5436 assert(res != -3);
5437 if (wlen) {
5438 *wlen = (size_t)res;
5439 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005440 return NULL;
5441 }
5442 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005443}
5444
Antoine Pitrouab868312009-01-10 15:40:25 +00005445
/* UTF-8 encoder for wchar_t strings.

   'errors' selects the error handler and must be one of _Py_ERROR_STRICT,
   _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS.

   On success, return 0 and write the newly allocated character string into
   *str (free it with PyMem_RawFree() if raw_malloc is non-zero, with
   PyMem_Free() otherwise).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1.

   If 'errors' is any other error handler, return -3. */
5456int
5457_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005458 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005459{
5460 const Py_ssize_t max_char_size = 4;
5461 Py_ssize_t len = wcslen(text);
5462
5463 assert(len >= 0);
5464
Victor Stinner3d4226a2018-08-29 22:21:32 +02005465 int surrogateescape = 0;
5466 int surrogatepass = 0;
5467 switch (errors)
5468 {
5469 case _Py_ERROR_STRICT:
5470 break;
5471 case _Py_ERROR_SURROGATEESCAPE:
5472 surrogateescape = 1;
5473 break;
5474 case _Py_ERROR_SURROGATEPASS:
5475 surrogatepass = 1;
5476 break;
5477 default:
5478 return -3;
5479 }
5480
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005481 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5482 return -1;
5483 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005484 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005485 if (raw_malloc) {
5486 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005487 }
5488 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005489 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005490 }
5491 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005492 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005493 }
5494
5495 char *p = bytes;
5496 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005497 for (i = 0; i < len; ) {
5498 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005499 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005500 i++;
5501#if Py_UNICODE_SIZE == 2
5502 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5503 && i < len
5504 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5505 {
5506 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5507 i++;
5508 }
5509#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005510
5511 if (ch < 0x80) {
5512 /* Encode ASCII */
5513 *p++ = (char) ch;
5514
5515 }
5516 else if (ch < 0x0800) {
5517 /* Encode Latin-1 */
5518 *p++ = (char)(0xc0 | (ch >> 6));
5519 *p++ = (char)(0x80 | (ch & 0x3f));
5520 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005521 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005522 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005523 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005524 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005525 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005526 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005527 if (reason != NULL) {
5528 *reason = "encoding error";
5529 }
5530 if (raw_malloc) {
5531 PyMem_RawFree(bytes);
5532 }
5533 else {
5534 PyMem_Free(bytes);
5535 }
5536 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005537 }
5538 *p++ = (char)(ch & 0xff);
5539 }
5540 else if (ch < 0x10000) {
5541 *p++ = (char)(0xe0 | (ch >> 12));
5542 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5543 *p++ = (char)(0x80 | (ch & 0x3f));
5544 }
5545 else { /* ch >= 0x10000 */
5546 assert(ch <= MAX_UNICODE);
5547 /* Encode UCS4 Unicode ordinals */
5548 *p++ = (char)(0xf0 | (ch >> 18));
5549 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5550 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5551 *p++ = (char)(0x80 | (ch & 0x3f));
5552 }
5553 }
5554 *p++ = '\0';
5555
5556 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005557 char *bytes2;
5558 if (raw_malloc) {
5559 bytes2 = PyMem_RawRealloc(bytes, final_size);
5560 }
5561 else {
5562 bytes2 = PyMem_Realloc(bytes, final_size);
5563 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005564 if (bytes2 == NULL) {
5565 if (error_pos != NULL) {
5566 *error_pos = (size_t)-1;
5567 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005568 if (raw_malloc) {
5569 PyMem_RawFree(bytes);
5570 }
5571 else {
5572 PyMem_Free(bytes);
5573 }
5574 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005575 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005576 *str = bytes2;
5577 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005578}
5579
5580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005581/* Primary internal function which creates utf8 encoded bytes objects.
5582
5583 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005584 and allocate exactly as much space needed at the end. Else allocate the
5585 maximum possible needed (4 result bytes per Unicode character), and return
5586 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005587*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005588static PyObject *
5589unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5590 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005592 if (!PyUnicode_Check(unicode)) {
5593 PyErr_BadArgument();
5594 return NULL;
5595 }
5596
5597 if (PyUnicode_READY(unicode) == -1)
5598 return NULL;
5599
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005600 if (PyUnicode_UTF8(unicode))
5601 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5602 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005603
Inada Naoki02a4d572020-02-27 13:48:59 +09005604 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005605 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005606 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5607
5608 _PyBytesWriter writer;
5609 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005610
Benjamin Petersonead6b532011-12-20 17:23:42 -06005611 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005612 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005613 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005614 case PyUnicode_1BYTE_KIND:
5615 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5616 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005617 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5618 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005619 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005620 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5621 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005622 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005623 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5624 break;
Tim Peters602f7402002-04-27 18:03:26 +00005625 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005626
5627 if (end == NULL) {
5628 _PyBytesWriter_Dealloc(&writer);
5629 return NULL;
5630 }
5631 return _PyBytesWriter_Finish(&writer, end);
5632}
5633
5634static int
5635unicode_fill_utf8(PyObject *unicode)
5636{
5637 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5638 assert(!PyUnicode_IS_ASCII(unicode));
5639
5640 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005641 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005642 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5643
5644 _PyBytesWriter writer;
5645 char *end;
5646
5647 switch (kind) {
5648 default:
5649 Py_UNREACHABLE();
5650 case PyUnicode_1BYTE_KIND:
5651 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5652 _Py_ERROR_STRICT, NULL);
5653 break;
5654 case PyUnicode_2BYTE_KIND:
5655 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5656 _Py_ERROR_STRICT, NULL);
5657 break;
5658 case PyUnicode_4BYTE_KIND:
5659 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5660 _Py_ERROR_STRICT, NULL);
5661 break;
5662 }
5663 if (end == NULL) {
5664 _PyBytesWriter_Dealloc(&writer);
5665 return -1;
5666 }
5667
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005668 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005669 PyBytes_AS_STRING(writer.buffer);
5670 Py_ssize_t len = end - start;
5671
Victor Stinner32bd68c2020-12-01 10:37:39 +01005672 char *cache = PyObject_Malloc(len + 1);
Inada Naoki02a4d572020-02-27 13:48:59 +09005673 if (cache == NULL) {
5674 _PyBytesWriter_Dealloc(&writer);
5675 PyErr_NoMemory();
5676 return -1;
5677 }
5678 _PyUnicode_UTF8(unicode) = cache;
5679 _PyUnicode_UTF8_LENGTH(unicode) = len;
5680 memcpy(cache, start, len);
5681 cache[len] = '\0';
5682 _PyBytesWriter_Dealloc(&writer);
5683 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684}
5685
Alexander Belopolsky40018472011-02-26 01:02:56 +00005686PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005687_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5688{
5689 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5690}
5691
5692
5693PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005694PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5695 Py_ssize_t size,
5696 const char *errors)
5697{
5698 PyObject *v, *unicode;
5699
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005700 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005701 if (unicode == NULL)
5702 return NULL;
5703 v = _PyUnicode_AsUTF8String(unicode, errors);
5704 Py_DECREF(unicode);
5705 return v;
5706}
5707
5708PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005709PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005711 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712}
5713
Walter Dörwald41980ca2007-08-16 21:55:45 +00005714/* --- UTF-32 Codec ------------------------------------------------------- */
5715
5716PyObject *
5717PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 Py_ssize_t size,
5719 const char *errors,
5720 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005721{
5722 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5723}
5724
5725PyObject *
5726PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 Py_ssize_t size,
5728 const char *errors,
5729 int *byteorder,
5730 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005731{
5732 const char *starts = s;
5733 Py_ssize_t startinpos;
5734 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005735 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005736 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005737 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005738 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005739 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005740 PyObject *errorHandler = NULL;
5741 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005742
Andy Lestere6be9b52020-02-11 20:28:35 -06005743 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005744 e = q + size;
5745
5746 if (byteorder)
5747 bo = *byteorder;
5748
5749 /* Check for BOM marks (U+FEFF) in the input and adjust current
5750 byte order setting accordingly. In native mode, the leading BOM
5751 mark is skipped, in all other modes, it is copied to the output
5752 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005753 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005754 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005755 if (bom == 0x0000FEFF) {
5756 bo = -1;
5757 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005759 else if (bom == 0xFFFE0000) {
5760 bo = 1;
5761 q += 4;
5762 }
5763 if (byteorder)
5764 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005765 }
5766
Victor Stinnere64322e2012-10-30 23:12:47 +01005767 if (q == e) {
5768 if (consumed)
5769 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005770 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005771 }
5772
Victor Stinnere64322e2012-10-30 23:12:47 +01005773#ifdef WORDS_BIGENDIAN
5774 le = bo < 0;
5775#else
5776 le = bo <= 0;
5777#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005778 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005779
Victor Stinner8f674cc2013-04-17 23:02:17 +02005780 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005781 writer.min_length = (e - q + 3) / 4;
5782 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005783 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005784
Victor Stinnere64322e2012-10-30 23:12:47 +01005785 while (1) {
5786 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005787 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005788
Victor Stinnere64322e2012-10-30 23:12:47 +01005789 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005790 enum PyUnicode_Kind kind = writer.kind;
5791 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005792 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005793 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005794 if (le) {
5795 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005796 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005797 if (ch > maxch)
5798 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005799 if (kind != PyUnicode_1BYTE_KIND &&
5800 Py_UNICODE_IS_SURROGATE(ch))
5801 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005803 q += 4;
5804 } while (q <= last);
5805 }
5806 else {
5807 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005808 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005809 if (ch > maxch)
5810 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005811 if (kind != PyUnicode_1BYTE_KIND &&
5812 Py_UNICODE_IS_SURROGATE(ch))
5813 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005814 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005815 q += 4;
5816 } while (q <= last);
5817 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005818 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005819 }
5820
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005821 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005822 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005823 startinpos = ((const char *)q) - starts;
5824 endinpos = startinpos + 4;
5825 }
5826 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005827 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005829 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005831 startinpos = ((const char *)q) - starts;
5832 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005834 else {
5835 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005836 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005837 goto onError;
5838 q += 4;
5839 continue;
5840 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005841 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005842 startinpos = ((const char *)q) - starts;
5843 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005845
5846 /* The remaining input chars are ignored if the callback
5847 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005848 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005850 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005852 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005854 }
5855
Walter Dörwald41980ca2007-08-16 21:55:45 +00005856 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005858
Walter Dörwald41980ca2007-08-16 21:55:45 +00005859 Py_XDECREF(errorHandler);
5860 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005861 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005862
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005864 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005865 Py_XDECREF(errorHandler);
5866 Py_XDECREF(exc);
5867 return NULL;
5868}
5869
5870PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871_PyUnicode_EncodeUTF32(PyObject *str,
5872 const char *errors,
5873 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005874{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005875 enum PyUnicode_Kind kind;
5876 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005877 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005878 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005879 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005880#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005881 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005882#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005883 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005884#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005885 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005886 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005887 PyObject *errorHandler = NULL;
5888 PyObject *exc = NULL;
5889 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005890
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 if (!PyUnicode_Check(str)) {
5892 PyErr_BadArgument();
5893 return NULL;
5894 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005895 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005896 return NULL;
5897 kind = PyUnicode_KIND(str);
5898 data = PyUnicode_DATA(str);
5899 len = PyUnicode_GET_LENGTH(str);
5900
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005901 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005902 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005903 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005904 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005905 if (v == NULL)
5906 return NULL;
5907
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005908 /* output buffer is 4-bytes aligned */
5909 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005910 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005911 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005912 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005913 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005914 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005915
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005916 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005917 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005918 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005919 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005920 else
5921 encoding = "utf-32";
5922
5923 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005924 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5925 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005926 }
5927
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005928 pos = 0;
5929 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005930 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005931
5932 if (kind == PyUnicode_2BYTE_KIND) {
5933 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5934 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005935 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005936 else {
5937 assert(kind == PyUnicode_4BYTE_KIND);
5938 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5939 &out, native_ordering);
5940 }
5941 if (pos == len)
5942 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005943
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005944 rep = unicode_encode_call_errorhandler(
5945 errors, &errorHandler,
5946 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005947 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005948 if (!rep)
5949 goto error;
5950
5951 if (PyBytes_Check(rep)) {
5952 repsize = PyBytes_GET_SIZE(rep);
5953 if (repsize & 3) {
5954 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005955 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005956 "surrogates not allowed");
5957 goto error;
5958 }
5959 moreunits = repsize / 4;
5960 }
5961 else {
5962 assert(PyUnicode_Check(rep));
5963 if (PyUnicode_READY(rep) < 0)
5964 goto error;
5965 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5966 if (!PyUnicode_IS_ASCII(rep)) {
5967 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005968 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005969 "surrogates not allowed");
5970 goto error;
5971 }
5972 }
5973
5974 /* four bytes are reserved for each surrogate */
5975 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005976 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005977 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005978 /* integer overflow */
5979 PyErr_NoMemory();
5980 goto error;
5981 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005982 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005983 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005984 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005985 }
5986
5987 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005988 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005989 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005990 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005991 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005992 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5993 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005994 }
5995
5996 Py_CLEAR(rep);
5997 }
5998
5999 /* Cut back to size actually needed. This is necessary for, for example,
6000 encoding of a string containing isolated surrogates and the 'ignore'
6001 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006002 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006003 if (nsize != PyBytes_GET_SIZE(v))
6004 _PyBytes_Resize(&v, nsize);
6005 Py_XDECREF(errorHandler);
6006 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006007 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006008 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006009 error:
6010 Py_XDECREF(rep);
6011 Py_XDECREF(errorHandler);
6012 Py_XDECREF(exc);
6013 Py_XDECREF(v);
6014 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00006015}
6016
Alexander Belopolsky40018472011-02-26 01:02:56 +00006017PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006018PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6019 Py_ssize_t size,
6020 const char *errors,
6021 int byteorder)
6022{
6023 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006024 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006025 if (tmp == NULL)
6026 return NULL;
6027 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6028 Py_DECREF(tmp);
6029 return result;
6030}
6031
6032PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006033PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00006034{
Victor Stinnerb960b342011-11-20 19:12:52 +01006035 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00006036}
6037
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038/* --- UTF-16 Codec ------------------------------------------------------- */
6039
Tim Peters772747b2001-08-09 22:21:55 +00006040PyObject *
6041PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 Py_ssize_t size,
6043 const char *errors,
6044 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045{
Walter Dörwald69652032004-09-07 20:24:22 +00006046 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6047}
6048
6049PyObject *
6050PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 Py_ssize_t size,
6052 const char *errors,
6053 int *byteorder,
6054 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00006055{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006057 Py_ssize_t startinpos;
6058 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006059 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006060 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006061 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006062 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006063 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 PyObject *errorHandler = NULL;
6065 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006066 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067
Andy Lestere6be9b52020-02-11 20:28:35 -06006068 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006069 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070
6071 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006072 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006074 /* Check for BOM marks (U+FEFF) in the input and adjust current
6075 byte order setting accordingly. In native mode, the leading BOM
6076 mark is skipped, in all other modes, it is copied to the output
6077 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006078 if (bo == 0 && size >= 2) {
6079 const Py_UCS4 bom = (q[1] << 8) | q[0];
6080 if (bom == 0xFEFF) {
6081 q += 2;
6082 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006084 else if (bom == 0xFFFE) {
6085 q += 2;
6086 bo = 1;
6087 }
6088 if (byteorder)
6089 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091
Antoine Pitrou63065d72012-05-15 23:48:04 +02006092 if (q == e) {
6093 if (consumed)
6094 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006095 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006096 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006097
Christian Heimes743e0cd2012-10-17 23:52:17 +02006098#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006099 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006100 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006101#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006102 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006103 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006104#endif
Tim Peters772747b2001-08-09 22:21:55 +00006105
Antoine Pitrou63065d72012-05-15 23:48:04 +02006106 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006107 character count normally. Error handler will take care of
6108 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006109 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006110 writer.min_length = (e - q + 1) / 2;
6111 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006112 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006113
Antoine Pitrou63065d72012-05-15 23:48:04 +02006114 while (1) {
6115 Py_UCS4 ch = 0;
6116 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006117 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006118 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006119 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006120 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006121 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006122 native_ordering);
6123 else
6124 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006125 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006126 native_ordering);
6127 } else if (kind == PyUnicode_2BYTE_KIND) {
6128 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006129 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006130 native_ordering);
6131 } else {
6132 assert(kind == PyUnicode_4BYTE_KIND);
6133 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006134 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006135 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006136 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006137 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138
Antoine Pitrou63065d72012-05-15 23:48:04 +02006139 switch (ch)
6140 {
6141 case 0:
6142 /* remaining byte at the end? (size should be even) */
6143 if (q == e || consumed)
6144 goto End;
6145 errmsg = "truncated data";
6146 startinpos = ((const char *)q) - starts;
6147 endinpos = ((const char *)e) - starts;
6148 break;
6149 /* The remaining input chars are ignored if the callback
6150 chooses to skip the input */
6151 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006152 q -= 2;
6153 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006154 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006155 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006156 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006157 endinpos = ((const char *)e) - starts;
6158 break;
6159 case 2:
6160 errmsg = "illegal encoding";
6161 startinpos = ((const char *)q) - 2 - starts;
6162 endinpos = startinpos + 2;
6163 break;
6164 case 3:
6165 errmsg = "illegal UTF-16 surrogate";
6166 startinpos = ((const char *)q) - 4 - starts;
6167 endinpos = startinpos + 2;
6168 break;
6169 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006170 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006171 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 continue;
6173 }
6174
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006175 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006176 errors,
6177 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006178 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006179 &starts,
6180 (const char **)&e,
6181 &startinpos,
6182 &endinpos,
6183 &exc,
6184 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006185 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 }
6188
Antoine Pitrou63065d72012-05-15 23:48:04 +02006189End:
Walter Dörwald69652032004-09-07 20:24:22 +00006190 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006192
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006193 Py_XDECREF(errorHandler);
6194 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006195 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006198 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 Py_XDECREF(errorHandler);
6200 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 return NULL;
6202}
6203
Tim Peters772747b2001-08-09 22:21:55 +00006204PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205_PyUnicode_EncodeUTF16(PyObject *str,
6206 const char *errors,
6207 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006209 enum PyUnicode_Kind kind;
6210 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006212 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006213 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006214 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006215#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006216 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006217#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006218 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006219#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006220 const char *encoding;
6221 Py_ssize_t nsize, pos;
6222 PyObject *errorHandler = NULL;
6223 PyObject *exc = NULL;
6224 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006225
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226 if (!PyUnicode_Check(str)) {
6227 PyErr_BadArgument();
6228 return NULL;
6229 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006230 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006231 return NULL;
6232 kind = PyUnicode_KIND(str);
6233 data = PyUnicode_DATA(str);
6234 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006235
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006236 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006237 if (kind == PyUnicode_4BYTE_KIND) {
6238 const Py_UCS4 *in = (const Py_UCS4 *)data;
6239 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006240 while (in < end) {
6241 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006242 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006243 }
6244 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006245 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006246 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006248 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006249 nsize = len + pairs + (byteorder == 0);
6250 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006251 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006255 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006256 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006257 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006258 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006259 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006260 }
6261 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006262 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006263 }
Tim Peters772747b2001-08-09 22:21:55 +00006264
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006265 if (kind == PyUnicode_1BYTE_KIND) {
6266 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6267 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006268 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006269
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006270 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006271 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006272 }
6273 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006274 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006275 }
6276 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006277 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006278 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006279
6280 pos = 0;
6281 while (pos < len) {
6282 Py_ssize_t repsize, moreunits;
6283
6284 if (kind == PyUnicode_2BYTE_KIND) {
6285 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6286 &out, native_ordering);
6287 }
6288 else {
6289 assert(kind == PyUnicode_4BYTE_KIND);
6290 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6291 &out, native_ordering);
6292 }
6293 if (pos == len)
6294 break;
6295
6296 rep = unicode_encode_call_errorhandler(
6297 errors, &errorHandler,
6298 encoding, "surrogates not allowed",
6299 str, &exc, pos, pos + 1, &pos);
6300 if (!rep)
6301 goto error;
6302
6303 if (PyBytes_Check(rep)) {
6304 repsize = PyBytes_GET_SIZE(rep);
6305 if (repsize & 1) {
6306 raise_encode_exception(&exc, encoding,
6307 str, pos - 1, pos,
6308 "surrogates not allowed");
6309 goto error;
6310 }
6311 moreunits = repsize / 2;
6312 }
6313 else {
6314 assert(PyUnicode_Check(rep));
6315 if (PyUnicode_READY(rep) < 0)
6316 goto error;
6317 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6318 if (!PyUnicode_IS_ASCII(rep)) {
6319 raise_encode_exception(&exc, encoding,
6320 str, pos - 1, pos,
6321 "surrogates not allowed");
6322 goto error;
6323 }
6324 }
6325
6326 /* two bytes are reserved for each surrogate */
6327 if (moreunits > 1) {
6328 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006329 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006330 /* integer overflow */
6331 PyErr_NoMemory();
6332 goto error;
6333 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006334 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006335 goto error;
6336 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6337 }
6338
6339 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006340 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006341 out += moreunits;
6342 } else /* rep is unicode */ {
6343 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6344 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6345 &out, native_ordering);
6346 }
6347
6348 Py_CLEAR(rep);
6349 }
6350
6351 /* Cut back to size actually needed. This is necessary for, for example,
6352 encoding of a string containing isolated surrogates and the 'ignore' handler
6353 is used. */
6354 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6355 if (nsize != PyBytes_GET_SIZE(v))
6356 _PyBytes_Resize(&v, nsize);
6357 Py_XDECREF(errorHandler);
6358 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006359 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006360 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006361 error:
6362 Py_XDECREF(rep);
6363 Py_XDECREF(errorHandler);
6364 Py_XDECREF(exc);
6365 Py_XDECREF(v);
6366 return NULL;
6367#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368}
6369
Alexander Belopolsky40018472011-02-26 01:02:56 +00006370PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006371PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6372 Py_ssize_t size,
6373 const char *errors,
6374 int byteorder)
6375{
6376 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006377 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006378 if (tmp == NULL)
6379 return NULL;
6380 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6381 Py_DECREF(tmp);
6382 return result;
6383}
6384
6385PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006386PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006388 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389}
6390
6391/* --- Unicode Escape Codec ----------------------------------------------- */
6392
Victor Stinner47e1afd2020-10-26 16:43:47 +01006393static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006394
Alexander Belopolsky40018472011-02-26 01:02:56 +00006395PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006396_PyUnicode_DecodeUnicodeEscape(const char *s,
6397 Py_ssize_t size,
6398 const char *errors,
6399 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006402 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 PyObject *errorHandler = NULL;
6405 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006406
Eric V. Smith42454af2016-10-31 09:22:08 -04006407 // so we can remember if we've seen an invalid escape char or not
6408 *first_invalid_escape = NULL;
6409
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006411 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 }
6413 /* Escaped strings will always be longer than the resulting
6414 Unicode string, so we start with size here and then reduce the
6415 length after conversion to the true value.
6416 (but if the error callback returns a long replacement string
6417 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006418 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 writer.min_length = size;
6420 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6421 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006422 }
6423
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 end = s + size;
6425 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 unsigned char c = (unsigned char) *s++;
6427 Py_UCS4 ch;
6428 int count;
6429 Py_ssize_t startinpos;
6430 Py_ssize_t endinpos;
6431 const char *message;
6432
6433#define WRITE_ASCII_CHAR(ch) \
6434 do { \
6435 assert(ch <= 127); \
6436 assert(writer.pos < writer.size); \
6437 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6438 } while(0)
6439
6440#define WRITE_CHAR(ch) \
6441 do { \
6442 if (ch <= writer.maxchar) { \
6443 assert(writer.pos < writer.size); \
6444 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6445 } \
6446 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6447 goto onError; \
6448 } \
6449 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450
6451 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 if (c != '\\') {
6453 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 continue;
6455 }
6456
Victor Stinner62ec3312016-09-06 17:04:34 -07006457 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006459 if (s >= end) {
6460 message = "\\ at end of string";
6461 goto error;
6462 }
6463 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006464
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006466 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 case '\n': continue;
6470 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6471 case '\'': WRITE_ASCII_CHAR('\''); continue;
6472 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6473 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006474 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006475 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6476 case 't': WRITE_ASCII_CHAR('\t'); continue;
6477 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6478 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006479 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006480 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006481 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006482 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 case '0': case '1': case '2': case '3':
6486 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006487 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006488 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006489 ch = (ch<<3) + *s++ - '0';
6490 if (s < end && '0' <= *s && *s <= '7') {
6491 ch = (ch<<3) + *s++ - '0';
6492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006494 WRITE_CHAR(ch);
6495 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 /* hex escapes */
6498 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006500 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006501 message = "truncated \\xXX escape";
6502 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006506 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006507 message = "truncated \\uXXXX escape";
6508 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006511 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006512 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006513 message = "truncated \\UXXXXXXXX escape";
6514 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006515 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006516 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006517 ch <<= 4;
6518 if (c >= '0' && c <= '9') {
6519 ch += c - '0';
6520 }
6521 else if (c >= 'a' && c <= 'f') {
6522 ch += c - ('a' - 10);
6523 }
6524 else if (c >= 'A' && c <= 'F') {
6525 ch += c - ('A' - 10);
6526 }
6527 else {
6528 break;
6529 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006530 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006531 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006532 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006533 }
6534
6535 /* when we get here, ch is a 32-bit unicode character */
6536 if (ch > MAX_UNICODE) {
6537 message = "illegal Unicode character";
6538 goto error;
6539 }
6540
6541 WRITE_CHAR(ch);
6542 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006543
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006545 case 'N':
Victor Stinner47e1afd2020-10-26 16:43:47 +01006546 if (ucnhash_capi == NULL) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006547 /* load the unicode data module */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006548 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006549 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner47e1afd2020-10-26 16:43:47 +01006550 if (ucnhash_capi == NULL) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006551 PyErr_SetString(
6552 PyExc_UnicodeError,
6553 "\\N escapes not supported (can't load unicodedata module)"
6554 );
6555 goto onError;
6556 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006557 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006558
6559 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006560 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006561 const char *start = ++s;
6562 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006563 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006564 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006565 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006566 namelen = s - start;
6567 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006568 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006569 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006570 ch = 0xffffffff; /* in case 'getcode' messes up */
6571 if (namelen <= INT_MAX &&
Victor Stinner920cb642020-10-26 19:19:36 +01006572 ucnhash_capi->getcode(start, (int)namelen,
Victor Stinner62ec3312016-09-06 17:04:34 -07006573 &ch, 0)) {
6574 assert(ch <= MAX_UNICODE);
6575 WRITE_CHAR(ch);
6576 continue;
6577 }
6578 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006579 }
6580 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006581 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006582
6583 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006584 if (*first_invalid_escape == NULL) {
6585 *first_invalid_escape = s-1; /* Back up one char, since we've
6586 already incremented s. */
6587 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006588 WRITE_ASCII_CHAR('\\');
6589 WRITE_CHAR(c);
6590 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006592
6593 error:
6594 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006595 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006596 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006597 errors, &errorHandler,
6598 "unicodeescape", message,
6599 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006600 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006601 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006602 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006603 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006604
6605#undef WRITE_ASCII_CHAR
6606#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006608
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006609 Py_XDECREF(errorHandler);
6610 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006611 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006612
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006614 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615 Py_XDECREF(errorHandler);
6616 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 return NULL;
6618}
6619
Eric V. Smith42454af2016-10-31 09:22:08 -04006620PyObject *
6621PyUnicode_DecodeUnicodeEscape(const char *s,
6622 Py_ssize_t size,
6623 const char *errors)
6624{
6625 const char *first_invalid_escape;
6626 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6627 &first_invalid_escape);
6628 if (result == NULL)
6629 return NULL;
6630 if (first_invalid_escape != NULL) {
6631 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6632 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006633 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006634 Py_DECREF(result);
6635 return NULL;
6636 }
6637 }
6638 return result;
6639}
6640
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006641/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642
Alexander Belopolsky40018472011-02-26 01:02:56 +00006643PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006646 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006647 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006649 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006650 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006651 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652
Ezio Melottie7f90372012-10-05 03:33:31 +03006653 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006654 escape.
6655
Ezio Melottie7f90372012-10-05 03:33:31 +03006656 For UCS1 strings it's '\xxx', 4 bytes per source character.
6657 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6658 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006659 */
6660
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006661 if (!PyUnicode_Check(unicode)) {
6662 PyErr_BadArgument();
6663 return NULL;
6664 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006665 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006666 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006667 }
Victor Stinner358af132015-10-12 22:36:57 +02006668
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006669 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006670 if (len == 0) {
6671 return PyBytes_FromStringAndSize(NULL, 0);
6672 }
6673
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006674 kind = PyUnicode_KIND(unicode);
6675 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006676 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6677 bytes, and 1 byte characters 4. */
6678 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006679 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006680 return PyErr_NoMemory();
6681 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006682 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006683 if (repr == NULL) {
6684 return NULL;
6685 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006686
Victor Stinner62ec3312016-09-06 17:04:34 -07006687 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006688 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006689 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006690
Victor Stinner62ec3312016-09-06 17:04:34 -07006691 /* U+0000-U+00ff range */
6692 if (ch < 0x100) {
6693 if (ch >= ' ' && ch < 127) {
6694 if (ch != '\\') {
6695 /* Copy printable US ASCII as-is */
6696 *p++ = (char) ch;
6697 }
6698 /* Escape backslashes */
6699 else {
6700 *p++ = '\\';
6701 *p++ = '\\';
6702 }
6703 }
Victor Stinner358af132015-10-12 22:36:57 +02006704
Victor Stinner62ec3312016-09-06 17:04:34 -07006705 /* Map special whitespace to '\t', \n', '\r' */
6706 else if (ch == '\t') {
6707 *p++ = '\\';
6708 *p++ = 't';
6709 }
6710 else if (ch == '\n') {
6711 *p++ = '\\';
6712 *p++ = 'n';
6713 }
6714 else if (ch == '\r') {
6715 *p++ = '\\';
6716 *p++ = 'r';
6717 }
6718
6719 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6720 else {
6721 *p++ = '\\';
6722 *p++ = 'x';
6723 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6724 *p++ = Py_hexdigits[ch & 0x000F];
6725 }
Tim Petersced69f82003-09-16 20:30:58 +00006726 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006727 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006728 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 *p++ = '\\';
6730 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006731 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6732 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6733 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6734 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006736 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6737 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006738
Victor Stinner62ec3312016-09-06 17:04:34 -07006739 /* Make sure that the first two digits are zero */
6740 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006741 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006742 *p++ = 'U';
6743 *p++ = '0';
6744 *p++ = '0';
6745 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6746 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6747 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6748 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6749 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6750 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
Victor Stinner62ec3312016-09-06 17:04:34 -07006754 assert(p - PyBytes_AS_STRING(repr) > 0);
6755 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6756 return NULL;
6757 }
6758 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759}
6760
Alexander Belopolsky40018472011-02-26 01:02:56 +00006761PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006762PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6763 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006765 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006766 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006767 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006769 }
6770
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006771 result = PyUnicode_AsUnicodeEscapeString(tmp);
6772 Py_DECREF(tmp);
6773 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774}
6775
6776/* --- Raw Unicode Escape Codec ------------------------------------------- */
6777
Alexander Belopolsky40018472011-02-26 01:02:56 +00006778PyObject *
6779PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006780 Py_ssize_t size,
6781 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006783 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006784 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006786 PyObject *errorHandler = NULL;
6787 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006788
Victor Stinner62ec3312016-09-06 17:04:34 -07006789 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006790 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006791 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006792
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 /* Escaped strings will always be longer than the resulting
6794 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006795 length after conversion to the true value. (But decoding error
6796 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006797 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006798 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006799 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6800 goto onError;
6801 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006802
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 end = s + size;
6804 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006805 unsigned char c = (unsigned char) *s++;
6806 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006807 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006808 Py_ssize_t startinpos;
6809 Py_ssize_t endinpos;
6810 const char *message;
6811
6812#define WRITE_CHAR(ch) \
6813 do { \
6814 if (ch <= writer.maxchar) { \
6815 assert(writer.pos < writer.size); \
6816 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6817 } \
6818 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6819 goto onError; \
6820 } \
6821 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006824 if (c != '\\' || s >= end) {
6825 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006827 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006828
Victor Stinner62ec3312016-09-06 17:04:34 -07006829 c = (unsigned char) *s++;
6830 if (c == 'u') {
6831 count = 4;
6832 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006834 else if (c == 'U') {
6835 count = 8;
6836 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006837 }
6838 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006839 assert(writer.pos < writer.size);
6840 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6841 WRITE_CHAR(c);
6842 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006843 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006844 startinpos = s - starts - 2;
6845
6846 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6847 for (ch = 0; count && s < end; ++s, --count) {
6848 c = (unsigned char)*s;
6849 ch <<= 4;
6850 if (c >= '0' && c <= '9') {
6851 ch += c - '0';
6852 }
6853 else if (c >= 'a' && c <= 'f') {
6854 ch += c - ('a' - 10);
6855 }
6856 else if (c >= 'A' && c <= 'F') {
6857 ch += c - ('A' - 10);
6858 }
6859 else {
6860 break;
6861 }
6862 }
6863 if (!count) {
6864 if (ch <= MAX_UNICODE) {
6865 WRITE_CHAR(ch);
6866 continue;
6867 }
6868 message = "\\Uxxxxxxxx out of range";
6869 }
6870
6871 endinpos = s-starts;
6872 writer.min_length = end - s + writer.pos;
6873 if (unicode_decode_call_errorhandler_writer(
6874 errors, &errorHandler,
6875 "rawunicodeescape", message,
6876 &starts, &end, &startinpos, &endinpos, &exc, &s,
6877 &writer)) {
6878 goto onError;
6879 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006880 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006881
6882#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006884 Py_XDECREF(errorHandler);
6885 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006886 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006887
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006889 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890 Py_XDECREF(errorHandler);
6891 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006893
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894}
6895
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006896
Alexander Belopolsky40018472011-02-26 01:02:56 +00006897PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006898PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899{
Victor Stinner62ec3312016-09-06 17:04:34 -07006900 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006902 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006903 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006904 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006905 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006907 if (!PyUnicode_Check(unicode)) {
6908 PyErr_BadArgument();
6909 return NULL;
6910 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006911 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006912 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006913 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006914 kind = PyUnicode_KIND(unicode);
6915 data = PyUnicode_DATA(unicode);
6916 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006917 if (kind == PyUnicode_1BYTE_KIND) {
6918 return PyBytes_FromStringAndSize(data, len);
6919 }
Victor Stinner0e368262011-11-10 20:12:49 +01006920
Victor Stinner62ec3312016-09-06 17:04:34 -07006921 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6922 bytes, and 1 byte characters 4. */
6923 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006924
Victor Stinner62ec3312016-09-06 17:04:34 -07006925 if (len > PY_SSIZE_T_MAX / expandsize) {
6926 return PyErr_NoMemory();
6927 }
6928 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6929 if (repr == NULL) {
6930 return NULL;
6931 }
6932 if (len == 0) {
6933 return repr;
6934 }
6935
6936 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006937 for (pos = 0; pos < len; pos++) {
6938 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006939
Victor Stinner62ec3312016-09-06 17:04:34 -07006940 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6941 if (ch < 0x100) {
6942 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006943 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006944 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006945 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 *p++ = '\\';
6947 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006948 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6949 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6950 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6951 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006953 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6954 else {
6955 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6956 *p++ = '\\';
6957 *p++ = 'U';
6958 *p++ = '0';
6959 *p++ = '0';
6960 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6961 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6962 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6963 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6964 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6965 *p++ = Py_hexdigits[ch & 15];
6966 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006968
Victor Stinner62ec3312016-09-06 17:04:34 -07006969 assert(p > PyBytes_AS_STRING(repr));
6970 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6971 return NULL;
6972 }
6973 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974}
6975
Alexander Belopolsky40018472011-02-26 01:02:56 +00006976PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006977PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6978 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006980 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006981 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006982 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006983 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006984 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6985 Py_DECREF(tmp);
6986 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987}
6988
6989/* --- Latin-1 Codec ------------------------------------------------------ */
6990
Alexander Belopolsky40018472011-02-26 01:02:56 +00006991PyObject *
6992PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006993 Py_ssize_t size,
6994 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006997 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998}
6999
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007000/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007001static void
7002make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007003 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007004 PyObject *unicode,
7005 Py_ssize_t startpos, Py_ssize_t endpos,
7006 const char *reason)
7007{
7008 if (*exceptionObject == NULL) {
7009 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007010 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01007011 encoding, unicode, startpos, endpos, reason);
7012 }
7013 else {
7014 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7015 goto onError;
7016 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7017 goto onError;
7018 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7019 goto onError;
7020 return;
7021 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02007022 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01007023 }
7024}
7025
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007026/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007027static void
7028raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007029 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007030 PyObject *unicode,
7031 Py_ssize_t startpos, Py_ssize_t endpos,
7032 const char *reason)
7033{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007034 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007035 encoding, unicode, startpos, endpos, reason);
7036 if (*exceptionObject != NULL)
7037 PyCodec_StrictErrors(*exceptionObject);
7038}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007039
7040/* error handling callback helper:
7041 build arguments, call the callback and check the arguments,
7042 put the result into newpos and return the replacement string, which
7043 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007044static PyObject *
7045unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007046 PyObject **errorHandler,
7047 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007048 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007049 Py_ssize_t startpos, Py_ssize_t endpos,
7050 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007051{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007052 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007053 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054 PyObject *restuple;
7055 PyObject *resunicode;
7056
7057 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007059 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007061 }
7062
Benjamin Petersonbac79492012-01-14 13:34:47 -05007063 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007064 return NULL;
7065 len = PyUnicode_GET_LENGTH(unicode);
7066
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007067 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007068 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007069 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071
Petr Viktorinffd97532020-02-11 17:46:57 +01007072 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007073 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007075 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007076 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 Py_DECREF(restuple);
7078 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007079 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007080 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 &resunicode, newpos)) {
7082 Py_DECREF(restuple);
7083 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007084 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007085 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7086 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7087 Py_DECREF(restuple);
7088 return NULL;
7089 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007090 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007091 *newpos = len + *newpos;
7092 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007093 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 Py_DECREF(restuple);
7095 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007096 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007097 Py_INCREF(resunicode);
7098 Py_DECREF(restuple);
7099 return resunicode;
7100}
7101
Alexander Belopolsky40018472011-02-26 01:02:56 +00007102static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007103unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007104 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007105 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007106{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007107 /* input state */
7108 Py_ssize_t pos=0, size;
7109 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007110 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007111 /* pointer into the output */
7112 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007113 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7114 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007115 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007116 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007117 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007118 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007119 /* output object */
7120 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007121
Benjamin Petersonbac79492012-01-14 13:34:47 -05007122 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007123 return NULL;
7124 size = PyUnicode_GET_LENGTH(unicode);
7125 kind = PyUnicode_KIND(unicode);
7126 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007127 /* allocate enough for a simple encoding without
7128 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007129 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007130 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007131
7132 _PyBytesWriter_Init(&writer);
7133 str = _PyBytesWriter_Alloc(&writer, size);
7134 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007135 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007136
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007137 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007138 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139
Benjamin Peterson29060642009-01-31 22:14:21 +00007140 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007141 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007143 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007144 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007145 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007147 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007149 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007150 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007152
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007153 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007155
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007156 /* Only overallocate the buffer if it's not the last write */
7157 writer.overallocate = (collend < size);
7158
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007160 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007161 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007162
7163 switch (error_handler) {
7164 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007165 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007167
7168 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007169 memset(str, '?', collend - collstart);
7170 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007171 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007172 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007173 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 break;
Victor Stinner50149202015-09-22 00:26:54 +02007175
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007176 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007177 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007178 writer.min_size -= (collend - collstart);
7179 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007180 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007181 if (str == NULL)
7182 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007183 pos = collend;
7184 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007185
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007186 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007187 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007188 writer.min_size -= (collend - collstart);
7189 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007190 unicode, collstart, collend);
7191 if (str == NULL)
7192 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007193 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 break;
Victor Stinner50149202015-09-22 00:26:54 +02007195
Victor Stinnerc3713e92015-09-29 12:32:13 +02007196 case _Py_ERROR_SURROGATEESCAPE:
7197 for (i = collstart; i < collend; ++i) {
7198 ch = PyUnicode_READ(kind, data, i);
7199 if (ch < 0xdc80 || 0xdcff < ch) {
7200 /* Not a UTF-8b surrogate */
7201 break;
7202 }
7203 *str++ = (char)(ch - 0xdc00);
7204 ++pos;
7205 }
7206 if (i >= collend)
7207 break;
7208 collstart = pos;
7209 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007210 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007211
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007213 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7214 encoding, reason, unicode, &exc,
7215 collstart, collend, &newpos);
7216 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007218
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007219 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007220 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007221
Victor Stinner6bd525b2015-10-09 13:10:05 +02007222 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007223 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007224 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007225 PyBytes_AS_STRING(rep),
7226 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007227 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007228 else {
7229 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007230
Victor Stinner6bd525b2015-10-09 13:10:05 +02007231 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007233
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007234 if (limit == 256 ?
7235 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7236 !PyUnicode_IS_ASCII(rep))
7237 {
7238 /* Not all characters are smaller than limit */
7239 raise_encode_exception(&exc, encoding, unicode,
7240 collstart, collend, reason);
7241 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007243 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7244 str = _PyBytesWriter_WriteBytes(&writer, str,
7245 PyUnicode_DATA(rep),
7246 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007248 if (str == NULL)
7249 goto onError;
7250
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007251 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007252 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007253 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007254
7255 /* If overallocation was disabled, ensure that it was the last
7256 write. Otherwise, we missed an optimization */
7257 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007258 }
7259 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007260
Victor Stinner50149202015-09-22 00:26:54 +02007261 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007263 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007264
7265 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007266 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007267 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007268 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007269 Py_XDECREF(exc);
7270 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007271}
7272
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007273/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007274PyObject *
7275PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007276 Py_ssize_t size,
7277 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007279 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007280 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007281 if (unicode == NULL)
7282 return NULL;
7283 result = unicode_encode_ucs1(unicode, errors, 256);
7284 Py_DECREF(unicode);
7285 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286}
7287
Alexander Belopolsky40018472011-02-26 01:02:56 +00007288PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007289_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290{
7291 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 PyErr_BadArgument();
7293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007295 if (PyUnicode_READY(unicode) == -1)
7296 return NULL;
7297 /* Fast path: if it is a one-byte string, construct
7298 bytes object directly. */
7299 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7300 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7301 PyUnicode_GET_LENGTH(unicode));
7302 /* Non-Latin-1 characters present. Defer to above function to
7303 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007304 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007305}
7306
7307PyObject*
7308PyUnicode_AsLatin1String(PyObject *unicode)
7309{
7310 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311}
7312
7313/* --- 7-bit ASCII Codec -------------------------------------------------- */
7314
Alexander Belopolsky40018472011-02-26 01:02:56 +00007315PyObject *
7316PyUnicode_DecodeASCII(const char *s,
7317 Py_ssize_t size,
7318 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007320 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007321 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007322 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007323 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007324 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007325
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007327 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007328
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007330 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007331 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007333
Inada Naoki770847a2019-06-24 12:30:24 +09007334 // Shortcut for simple case
7335 PyObject *u = PyUnicode_New(size, 127);
7336 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007337 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007338 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007339 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007340 if (outpos == size) {
7341 return u;
7342 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007343
Inada Naoki770847a2019-06-24 12:30:24 +09007344 _PyUnicodeWriter writer;
7345 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007346 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007347
Inada Naoki770847a2019-06-24 12:30:24 +09007348 s += outpos;
7349 int kind = writer.kind;
7350 void *data = writer.data;
7351 Py_ssize_t startinpos, endinpos;
7352
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007354 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007356 PyUnicode_WRITE(kind, data, writer.pos, c);
7357 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007359 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007361
7362 /* byte outsize range 0x00..0x7f: call the error handler */
7363
7364 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007365 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007366
7367 switch (error_handler)
7368 {
7369 case _Py_ERROR_REPLACE:
7370 case _Py_ERROR_SURROGATEESCAPE:
7371 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007372 but we may switch to UCS2 at the first write */
7373 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7374 goto onError;
7375 kind = writer.kind;
7376 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007377
7378 if (error_handler == _Py_ERROR_REPLACE)
7379 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7380 else
7381 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7382 writer.pos++;
7383 ++s;
7384 break;
7385
7386 case _Py_ERROR_IGNORE:
7387 ++s;
7388 break;
7389
7390 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 startinpos = s-starts;
7392 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007393 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007394 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 "ascii", "ordinal not in range(128)",
7396 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007397 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007399 kind = writer.kind;
7400 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007403 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007404 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007405 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007406
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007408 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007409 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007410 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 return NULL;
7412}
7413
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007414/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007415PyObject *
7416PyUnicode_EncodeASCII(const Py_UNICODE *p,
7417 Py_ssize_t size,
7418 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007420 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007421 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007422 if (unicode == NULL)
7423 return NULL;
7424 result = unicode_encode_ucs1(unicode, errors, 128);
7425 Py_DECREF(unicode);
7426 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427}
7428
Alexander Belopolsky40018472011-02-26 01:02:56 +00007429PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007430_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431{
7432 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 PyErr_BadArgument();
7434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007436 if (PyUnicode_READY(unicode) == -1)
7437 return NULL;
7438 /* Fast path: if it is an ASCII-only string, construct bytes object
7439 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007440 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007441 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7442 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007443 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007444}
7445
7446PyObject *
7447PyUnicode_AsASCIIString(PyObject *unicode)
7448{
7449 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450}
7451
Steve Dowercc16be82016-09-08 10:35:16 -07007452#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007453
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007454/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007455
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007456#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007457#define NEED_RETRY
7458#endif
7459
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7464#define DECODING_CHUNK_SIZE (INT_MAX/4)
7465
Victor Stinner3a50e702011-10-18 21:21:00 +02007466#ifndef WC_ERR_INVALID_CHARS
7467# define WC_ERR_INVALID_CHARS 0x0080
7468#endif
7469
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007470static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007471code_page_name(UINT code_page, PyObject **obj)
7472{
7473 *obj = NULL;
7474 if (code_page == CP_ACP)
7475 return "mbcs";
7476 if (code_page == CP_UTF7)
7477 return "CP_UTF7";
7478 if (code_page == CP_UTF8)
7479 return "CP_UTF8";
7480
7481 *obj = PyBytes_FromFormat("cp%u", code_page);
7482 if (*obj == NULL)
7483 return NULL;
7484 return PyBytes_AS_STRING(*obj);
7485}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486
Victor Stinner3a50e702011-10-18 21:21:00 +02007487static DWORD
7488decode_code_page_flags(UINT code_page)
7489{
7490 if (code_page == CP_UTF7) {
7491 /* The CP_UTF7 decoder only supports flags=0 */
7492 return 0;
7493 }
7494 else
7495 return MB_ERR_INVALID_CHARS;
7496}
7497
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007498/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 * Decode a byte string from a Windows code page into unicode object in strict
7500 * mode.
7501 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007502 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7503 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007504 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007505static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007506decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007507 wchar_t **buf,
7508 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 const char *in,
7510 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007511{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007512 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007513 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007515
7516 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007517 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007518 while ((outsize = MultiByteToWideChar(code_page, flags,
7519 in, insize, NULL, 0)) <= 0)
7520 {
7521 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7522 goto error;
7523 }
7524 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7525 flags = 0;
7526 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007527
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007528 /* Extend a wchar_t* buffer */
7529 Py_ssize_t n = *bufsize; /* Get the current length */
7530 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7531 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007532 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007533 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007534
7535 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7537 if (outsize <= 0)
7538 goto error;
7539 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007540
Victor Stinner3a50e702011-10-18 21:21:00 +02007541error:
7542 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7543 return -2;
7544 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007545 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007546}
7547
Victor Stinner3a50e702011-10-18 21:21:00 +02007548/*
7549 * Decode a byte string from a code page into unicode object with an error
7550 * handler.
7551 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007552 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007553 * UnicodeDecodeError exception and returns -1 on error.
7554 */
7555static int
7556decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007557 wchar_t **buf,
7558 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007559 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007560 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007561{
7562 const char *startin = in;
7563 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007564 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 /* Ideally, we should get reason from FormatMessage. This is the Windows
7566 2000 English version of the message. */
7567 const char *reason = "No mapping for the Unicode character exists "
7568 "in the target code page.";
7569 /* each step cannot decode more than 1 character, but a character can be
7570 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007571 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007572 int insize;
7573 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 PyObject *errorHandler = NULL;
7575 PyObject *exc = NULL;
7576 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007577 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 DWORD err;
7579 int ret = -1;
7580
7581 assert(size > 0);
7582
7583 encoding = code_page_name(code_page, &encoding_obj);
7584 if (encoding == NULL)
7585 return -1;
7586
Victor Stinner7d00cc12014-03-17 23:08:06 +01007587 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007588 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7589 UnicodeDecodeError. */
7590 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7591 if (exc != NULL) {
7592 PyCodec_StrictErrors(exc);
7593 Py_CLEAR(exc);
7594 }
7595 goto error;
7596 }
7597
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007598 /* Extend a wchar_t* buffer */
7599 Py_ssize_t n = *bufsize; /* Get the current length */
7600 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7601 PyErr_NoMemory();
7602 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007604 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7605 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007607 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007608
7609 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007610 while (in < endin)
7611 {
7612 /* Decode a character */
7613 insize = 1;
7614 do
7615 {
7616 outsize = MultiByteToWideChar(code_page, flags,
7617 in, insize,
7618 buffer, Py_ARRAY_LENGTH(buffer));
7619 if (outsize > 0)
7620 break;
7621 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007622 if (err == ERROR_INVALID_FLAGS && flags) {
7623 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7624 flags = 0;
7625 continue;
7626 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 if (err != ERROR_NO_UNICODE_TRANSLATION
7628 && err != ERROR_INSUFFICIENT_BUFFER)
7629 {
7630 PyErr_SetFromWindowsErr(0);
7631 goto error;
7632 }
7633 insize++;
7634 }
7635 /* 4=maximum length of a UTF-8 sequence */
7636 while (insize <= 4 && (in + insize) <= endin);
7637
7638 if (outsize <= 0) {
7639 Py_ssize_t startinpos, endinpos, outpos;
7640
Victor Stinner7d00cc12014-03-17 23:08:06 +01007641 /* last character in partial decode? */
7642 if (in + insize >= endin && !final)
7643 break;
7644
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 startinpos = in - startin;
7646 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007647 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007648 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 errors, &errorHandler,
7650 encoding, reason,
7651 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007652 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007653 {
7654 goto error;
7655 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007656 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007657 }
7658 else {
7659 in += insize;
7660 memcpy(out, buffer, outsize * sizeof(wchar_t));
7661 out += outsize;
7662 }
7663 }
7664
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007665 /* Shrink the buffer */
7666 assert(out - *buf <= *bufsize);
7667 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007668 /* (in - startin) <= size and size is an int */
7669 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007670
7671error:
7672 Py_XDECREF(encoding_obj);
7673 Py_XDECREF(errorHandler);
7674 Py_XDECREF(exc);
7675 return ret;
7676}
7677
Victor Stinner3a50e702011-10-18 21:21:00 +02007678static PyObject *
7679decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007680 const char *s, Py_ssize_t size,
7681 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007682{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007683 wchar_t *buf = NULL;
7684 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007685 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007686
Victor Stinner3a50e702011-10-18 21:21:00 +02007687 if (code_page < 0) {
7688 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7689 return NULL;
7690 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007691 if (size < 0) {
7692 PyErr_BadInternalCall();
7693 return NULL;
7694 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007695
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007696 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007698
Victor Stinner76a31a62011-11-04 00:05:13 +01007699 do
7700 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007701#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007702 if (size > DECODING_CHUNK_SIZE) {
7703 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007704 final = 0;
7705 done = 0;
7706 }
7707 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007708#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007709 {
7710 chunk_size = (int)size;
7711 final = (consumed == NULL);
7712 done = 1;
7713 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007714
Victor Stinner76a31a62011-11-04 00:05:13 +01007715 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007716 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007717 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007718 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007719 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007720
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007721 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007722 s, chunk_size);
7723 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007724 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007725 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007726 errors, final);
7727 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007728
7729 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007730 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 return NULL;
7732 }
7733
7734 if (consumed)
7735 *consumed += converted;
7736
7737 s += converted;
7738 size -= converted;
7739 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007740
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007741 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7742 PyMem_Free(buf);
7743 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007744}
7745
Alexander Belopolsky40018472011-02-26 01:02:56 +00007746PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007747PyUnicode_DecodeCodePageStateful(int code_page,
7748 const char *s,
7749 Py_ssize_t size,
7750 const char *errors,
7751 Py_ssize_t *consumed)
7752{
7753 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7754}
7755
7756PyObject *
7757PyUnicode_DecodeMBCSStateful(const char *s,
7758 Py_ssize_t size,
7759 const char *errors,
7760 Py_ssize_t *consumed)
7761{
7762 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7763}
7764
7765PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007766PyUnicode_DecodeMBCS(const char *s,
7767 Py_ssize_t size,
7768 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007769{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007770 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7771}
7772
Victor Stinner3a50e702011-10-18 21:21:00 +02007773static DWORD
7774encode_code_page_flags(UINT code_page, const char *errors)
7775{
7776 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007777 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007778 }
7779 else if (code_page == CP_UTF7) {
7780 /* CP_UTF7 only supports flags=0 */
7781 return 0;
7782 }
7783 else {
7784 if (errors != NULL && strcmp(errors, "replace") == 0)
7785 return 0;
7786 else
7787 return WC_NO_BEST_FIT_CHARS;
7788 }
7789}
7790
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007791/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007792 * Encode a Unicode string to a Windows code page into a byte string in strict
7793 * mode.
7794 *
7795 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007796 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007797 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007798static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007799encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007800 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007801 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007802{
Victor Stinner554f3f02010-06-16 23:33:54 +00007803 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007804 BOOL *pusedDefaultChar = &usedDefaultChar;
7805 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007806 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007807 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007808 const DWORD flags = encode_code_page_flags(code_page, NULL);
7809 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007810 /* Create a substring so that we can get the UTF-16 representation
7811 of just the slice under consideration. */
7812 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007813 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007814
Martin v. Löwis3d325192011-11-04 18:23:06 +01007815 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007816
Victor Stinner3a50e702011-10-18 21:21:00 +02007817 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007818 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007819 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007820 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007821
Victor Stinner2fc507f2011-11-04 20:06:39 +01007822 substring = PyUnicode_Substring(unicode, offset, offset+len);
7823 if (substring == NULL)
7824 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007825#if USE_UNICODE_WCHAR_CACHE
7826_Py_COMP_DIAG_PUSH
7827_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007828 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7829 if (p == NULL) {
7830 Py_DECREF(substring);
7831 return -1;
7832 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007833_Py_COMP_DIAG_POP
7834#else /* USE_UNICODE_WCHAR_CACHE */
7835 p = PyUnicode_AsWideCharString(substring, &size);
7836 Py_CLEAR(substring);
7837 if (p == NULL) {
7838 return -1;
7839 }
7840#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007841 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007842
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007843 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007844 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007845 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007846 NULL, 0,
7847 NULL, pusedDefaultChar);
7848 if (outsize <= 0)
7849 goto error;
7850 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007851 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007852 ret = -2;
7853 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007854 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007855
Victor Stinner3a50e702011-10-18 21:21:00 +02007856 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007858 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007859 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007860 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007861 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007862 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007863 }
7864 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007866 const Py_ssize_t n = PyBytes_Size(*outbytes);
7867 if (outsize > PY_SSIZE_T_MAX - n) {
7868 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007869 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007870 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007871 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007872 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007873 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007874 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007875 }
7876
7877 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007878 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007879 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007880 out, outsize,
7881 NULL, pusedDefaultChar);
7882 if (outsize <= 0)
7883 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007884 if (pusedDefaultChar && *pusedDefaultChar) {
7885 ret = -2;
7886 goto done;
7887 }
7888 ret = 0;
7889
7890done:
7891#if USE_UNICODE_WCHAR_CACHE
7892 Py_DECREF(substring);
7893#else /* USE_UNICODE_WCHAR_CACHE */
7894 PyMem_Free(p);
7895#endif /* USE_UNICODE_WCHAR_CACHE */
7896 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007897
Victor Stinner3a50e702011-10-18 21:21:00 +02007898error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007899 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7900 ret = -2;
7901 goto done;
7902 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007903 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007904 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007905}
7906
Victor Stinner3a50e702011-10-18 21:21:00 +02007907/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007908 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007909 * error handler.
7910 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007911 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007912 * -1 on other error.
7913 */
7914static int
7915encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007916 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007917 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007918{
Victor Stinner3a50e702011-10-18 21:21:00 +02007919 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007920 Py_ssize_t pos = unicode_offset;
7921 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007922 /* Ideally, we should get reason from FormatMessage. This is the Windows
7923 2000 English version of the message. */
7924 const char *reason = "invalid character";
7925 /* 4=maximum length of a UTF-8 sequence */
7926 char buffer[4];
7927 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7928 Py_ssize_t outsize;
7929 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007930 PyObject *errorHandler = NULL;
7931 PyObject *exc = NULL;
7932 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007933 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007934 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007935 PyObject *rep;
7936 int ret = -1;
7937
7938 assert(insize > 0);
7939
7940 encoding = code_page_name(code_page, &encoding_obj);
7941 if (encoding == NULL)
7942 return -1;
7943
7944 if (errors == NULL || strcmp(errors, "strict") == 0) {
7945 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7946 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007947 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007948 if (exc != NULL) {
7949 PyCodec_StrictErrors(exc);
7950 Py_DECREF(exc);
7951 }
7952 Py_XDECREF(encoding_obj);
7953 return -1;
7954 }
7955
7956 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7957 pusedDefaultChar = &usedDefaultChar;
7958 else
7959 pusedDefaultChar = NULL;
7960
7961 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7962 PyErr_NoMemory();
7963 goto error;
7964 }
7965 outsize = insize * Py_ARRAY_LENGTH(buffer);
7966
7967 if (*outbytes == NULL) {
7968 /* Create string object */
7969 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7970 if (*outbytes == NULL)
7971 goto error;
7972 out = PyBytes_AS_STRING(*outbytes);
7973 }
7974 else {
7975 /* Extend string object */
7976 Py_ssize_t n = PyBytes_Size(*outbytes);
7977 if (n > PY_SSIZE_T_MAX - outsize) {
7978 PyErr_NoMemory();
7979 goto error;
7980 }
7981 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7982 goto error;
7983 out = PyBytes_AS_STRING(*outbytes) + n;
7984 }
7985
7986 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007987 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007988 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007989 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7990 wchar_t chars[2];
7991 int charsize;
7992 if (ch < 0x10000) {
7993 chars[0] = (wchar_t)ch;
7994 charsize = 1;
7995 }
7996 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007997 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7998 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007999 charsize = 2;
8000 }
8001
Victor Stinner3a50e702011-10-18 21:21:00 +02008002 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008003 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02008004 buffer, Py_ARRAY_LENGTH(buffer),
8005 NULL, pusedDefaultChar);
8006 if (outsize > 0) {
8007 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8008 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008009 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02008010 memcpy(out, buffer, outsize);
8011 out += outsize;
8012 continue;
8013 }
8014 }
8015 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8016 PyErr_SetFromWindowsErr(0);
8017 goto error;
8018 }
8019
Victor Stinner3a50e702011-10-18 21:21:00 +02008020 rep = unicode_encode_call_errorhandler(
8021 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01008022 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008023 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02008024 if (rep == NULL)
8025 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008026 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02008027
8028 if (PyBytes_Check(rep)) {
8029 outsize = PyBytes_GET_SIZE(rep);
8030 if (outsize != 1) {
8031 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8032 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8033 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8034 Py_DECREF(rep);
8035 goto error;
8036 }
8037 out = PyBytes_AS_STRING(*outbytes) + offset;
8038 }
8039 memcpy(out, PyBytes_AS_STRING(rep), outsize);
8040 out += outsize;
8041 }
8042 else {
8043 Py_ssize_t i;
8044 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008045 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008046
Benjamin Petersonbac79492012-01-14 13:34:47 -05008047 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008048 Py_DECREF(rep);
8049 goto error;
8050 }
8051
8052 outsize = PyUnicode_GET_LENGTH(rep);
8053 if (outsize != 1) {
8054 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8055 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8056 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8057 Py_DECREF(rep);
8058 goto error;
8059 }
8060 out = PyBytes_AS_STRING(*outbytes) + offset;
8061 }
8062 kind = PyUnicode_KIND(rep);
8063 data = PyUnicode_DATA(rep);
8064 for (i=0; i < outsize; i++) {
8065 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8066 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008067 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008068 encoding, unicode,
8069 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008070 "unable to encode error handler result to ASCII");
8071 Py_DECREF(rep);
8072 goto error;
8073 }
8074 *out = (unsigned char)ch;
8075 out++;
8076 }
8077 }
8078 Py_DECREF(rep);
8079 }
8080 /* write a NUL byte */
8081 *out = 0;
8082 outsize = out - PyBytes_AS_STRING(*outbytes);
8083 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8084 if (_PyBytes_Resize(outbytes, outsize) < 0)
8085 goto error;
8086 ret = 0;
8087
8088error:
8089 Py_XDECREF(encoding_obj);
8090 Py_XDECREF(errorHandler);
8091 Py_XDECREF(exc);
8092 return ret;
8093}
8094
Victor Stinner3a50e702011-10-18 21:21:00 +02008095static PyObject *
8096encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008097 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008098 const char *errors)
8099{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008100 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008101 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008102 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008103 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008104
Victor Stinner29dacf22015-01-26 16:41:32 +01008105 if (!PyUnicode_Check(unicode)) {
8106 PyErr_BadArgument();
8107 return NULL;
8108 }
8109
Benjamin Petersonbac79492012-01-14 13:34:47 -05008110 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008111 return NULL;
8112 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008113
Victor Stinner3a50e702011-10-18 21:21:00 +02008114 if (code_page < 0) {
8115 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8116 return NULL;
8117 }
8118
Martin v. Löwis3d325192011-11-04 18:23:06 +01008119 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008120 return PyBytes_FromStringAndSize(NULL, 0);
8121
Victor Stinner7581cef2011-11-03 22:32:33 +01008122 offset = 0;
8123 do
8124 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008125#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008126 if (len > DECODING_CHUNK_SIZE) {
8127 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008128 done = 0;
8129 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008130 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008131#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008132 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008133 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008134 done = 1;
8135 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008136
Victor Stinner76a31a62011-11-04 00:05:13 +01008137 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008138 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008139 errors);
8140 if (ret == -2)
8141 ret = encode_code_page_errors(code_page, &outbytes,
8142 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008143 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008144 if (ret < 0) {
8145 Py_XDECREF(outbytes);
8146 return NULL;
8147 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008148
Victor Stinner7581cef2011-11-03 22:32:33 +01008149 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008150 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008151 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008152
Victor Stinner3a50e702011-10-18 21:21:00 +02008153 return outbytes;
8154}
8155
8156PyObject *
8157PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8158 Py_ssize_t size,
8159 const char *errors)
8160{
Victor Stinner7581cef2011-11-03 22:32:33 +01008161 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008162 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008163 if (unicode == NULL)
8164 return NULL;
8165 res = encode_code_page(CP_ACP, unicode, errors);
8166 Py_DECREF(unicode);
8167 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008168}
8169
8170PyObject *
8171PyUnicode_EncodeCodePage(int code_page,
8172 PyObject *unicode,
8173 const char *errors)
8174{
Victor Stinner7581cef2011-11-03 22:32:33 +01008175 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008176}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008177
Alexander Belopolsky40018472011-02-26 01:02:56 +00008178PyObject *
8179PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008180{
Victor Stinner7581cef2011-11-03 22:32:33 +01008181 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008182}
8183
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008184#undef NEED_RETRY
8185
Steve Dowercc16be82016-09-08 10:35:16 -07008186#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008187
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188/* --- Character Mapping Codec -------------------------------------------- */
8189
Victor Stinnerfb161b12013-04-18 01:44:27 +02008190static int
8191charmap_decode_string(const char *s,
8192 Py_ssize_t size,
8193 PyObject *mapping,
8194 const char *errors,
8195 _PyUnicodeWriter *writer)
8196{
8197 const char *starts = s;
8198 const char *e;
8199 Py_ssize_t startinpos, endinpos;
8200 PyObject *errorHandler = NULL, *exc = NULL;
8201 Py_ssize_t maplen;
8202 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008203 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008204 Py_UCS4 x;
8205 unsigned char ch;
8206
8207 if (PyUnicode_READY(mapping) == -1)
8208 return -1;
8209
8210 maplen = PyUnicode_GET_LENGTH(mapping);
8211 mapdata = PyUnicode_DATA(mapping);
8212 mapkind = PyUnicode_KIND(mapping);
8213
8214 e = s + size;
8215
8216 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8217 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8218 * is disabled in encoding aliases, latin1 is preferred because
8219 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008220 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008221 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8222 Py_UCS4 maxchar = writer->maxchar;
8223
8224 assert (writer->kind == PyUnicode_1BYTE_KIND);
8225 while (s < e) {
8226 ch = *s;
8227 x = mapdata_ucs1[ch];
8228 if (x > maxchar) {
8229 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8230 goto onError;
8231 maxchar = writer->maxchar;
8232 outdata = (Py_UCS1 *)writer->data;
8233 }
8234 outdata[writer->pos] = x;
8235 writer->pos++;
8236 ++s;
8237 }
8238 return 0;
8239 }
8240
8241 while (s < e) {
8242 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8243 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008244 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008245 if (outkind == PyUnicode_1BYTE_KIND) {
8246 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8247 Py_UCS4 maxchar = writer->maxchar;
8248 while (s < e) {
8249 ch = *s;
8250 x = mapdata_ucs2[ch];
8251 if (x > maxchar)
8252 goto Error;
8253 outdata[writer->pos] = x;
8254 writer->pos++;
8255 ++s;
8256 }
8257 break;
8258 }
8259 else if (outkind == PyUnicode_2BYTE_KIND) {
8260 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8261 while (s < e) {
8262 ch = *s;
8263 x = mapdata_ucs2[ch];
8264 if (x == 0xFFFE)
8265 goto Error;
8266 outdata[writer->pos] = x;
8267 writer->pos++;
8268 ++s;
8269 }
8270 break;
8271 }
8272 }
8273 ch = *s;
8274
8275 if (ch < maplen)
8276 x = PyUnicode_READ(mapkind, mapdata, ch);
8277 else
8278 x = 0xfffe; /* invalid value */
8279Error:
8280 if (x == 0xfffe)
8281 {
8282 /* undefined mapping */
8283 startinpos = s-starts;
8284 endinpos = startinpos+1;
8285 if (unicode_decode_call_errorhandler_writer(
8286 errors, &errorHandler,
8287 "charmap", "character maps to <undefined>",
8288 &starts, &e, &startinpos, &endinpos, &exc, &s,
8289 writer)) {
8290 goto onError;
8291 }
8292 continue;
8293 }
8294
8295 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8296 goto onError;
8297 ++s;
8298 }
8299 Py_XDECREF(errorHandler);
8300 Py_XDECREF(exc);
8301 return 0;
8302
8303onError:
8304 Py_XDECREF(errorHandler);
8305 Py_XDECREF(exc);
8306 return -1;
8307}
8308
8309static int
8310charmap_decode_mapping(const char *s,
8311 Py_ssize_t size,
8312 PyObject *mapping,
8313 const char *errors,
8314 _PyUnicodeWriter *writer)
8315{
8316 const char *starts = s;
8317 const char *e;
8318 Py_ssize_t startinpos, endinpos;
8319 PyObject *errorHandler = NULL, *exc = NULL;
8320 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008321 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008322
8323 e = s + size;
8324
8325 while (s < e) {
8326 ch = *s;
8327
8328 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8329 key = PyLong_FromLong((long)ch);
8330 if (key == NULL)
8331 goto onError;
8332
8333 item = PyObject_GetItem(mapping, key);
8334 Py_DECREF(key);
8335 if (item == NULL) {
8336 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8337 /* No mapping found means: mapping is undefined. */
8338 PyErr_Clear();
8339 goto Undefined;
8340 } else
8341 goto onError;
8342 }
8343
8344 /* Apply mapping */
8345 if (item == Py_None)
8346 goto Undefined;
8347 if (PyLong_Check(item)) {
8348 long value = PyLong_AS_LONG(item);
8349 if (value == 0xFFFE)
8350 goto Undefined;
8351 if (value < 0 || value > MAX_UNICODE) {
8352 PyErr_Format(PyExc_TypeError,
Max Bernstein36353882020-10-17 13:38:21 -07008353 "character mapping must be in range(0x%x)",
Victor Stinnerfb161b12013-04-18 01:44:27 +02008354 (unsigned long)MAX_UNICODE + 1);
8355 goto onError;
8356 }
8357
8358 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8359 goto onError;
8360 }
8361 else if (PyUnicode_Check(item)) {
8362 if (PyUnicode_READY(item) == -1)
8363 goto onError;
8364 if (PyUnicode_GET_LENGTH(item) == 1) {
8365 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8366 if (value == 0xFFFE)
8367 goto Undefined;
8368 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8369 goto onError;
8370 }
8371 else {
8372 writer->overallocate = 1;
8373 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8374 goto onError;
8375 }
8376 }
8377 else {
8378 /* wrong return value */
8379 PyErr_SetString(PyExc_TypeError,
8380 "character mapping must return integer, None or str");
8381 goto onError;
8382 }
8383 Py_CLEAR(item);
8384 ++s;
8385 continue;
8386
8387Undefined:
8388 /* undefined mapping */
8389 Py_CLEAR(item);
8390 startinpos = s-starts;
8391 endinpos = startinpos+1;
8392 if (unicode_decode_call_errorhandler_writer(
8393 errors, &errorHandler,
8394 "charmap", "character maps to <undefined>",
8395 &starts, &e, &startinpos, &endinpos, &exc, &s,
8396 writer)) {
8397 goto onError;
8398 }
8399 }
8400 Py_XDECREF(errorHandler);
8401 Py_XDECREF(exc);
8402 return 0;
8403
8404onError:
8405 Py_XDECREF(item);
8406 Py_XDECREF(errorHandler);
8407 Py_XDECREF(exc);
8408 return -1;
8409}
8410
Alexander Belopolsky40018472011-02-26 01:02:56 +00008411PyObject *
8412PyUnicode_DecodeCharmap(const char *s,
8413 Py_ssize_t size,
8414 PyObject *mapping,
8415 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008417 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008418
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 /* Default to Latin-1 */
8420 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008424 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008425 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008426 writer.min_length = size;
8427 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008429
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008430 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008431 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8432 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008433 }
8434 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008435 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8436 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008438 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008439
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008441 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 return NULL;
8443}
8444
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008445/* Charmap encoding: the lookup table */
8446
Alexander Belopolsky40018472011-02-26 01:02:56 +00008447struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 PyObject_HEAD
8449 unsigned char level1[32];
8450 int count2, count3;
8451 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008452};
8453
8454static PyObject*
8455encoding_map_size(PyObject *obj, PyObject* args)
8456{
8457 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008460}
8461
8462static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 PyDoc_STR("Return the size (in bytes) of this object") },
8465 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008466};
8467
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008468static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008469 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 "EncodingMap", /*tp_name*/
8471 sizeof(struct encoding_map), /*tp_basicsize*/
8472 0, /*tp_itemsize*/
8473 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008474 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008475 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 0, /*tp_getattr*/
8477 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008478 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 0, /*tp_repr*/
8480 0, /*tp_as_number*/
8481 0, /*tp_as_sequence*/
8482 0, /*tp_as_mapping*/
8483 0, /*tp_hash*/
8484 0, /*tp_call*/
8485 0, /*tp_str*/
8486 0, /*tp_getattro*/
8487 0, /*tp_setattro*/
8488 0, /*tp_as_buffer*/
8489 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8490 0, /*tp_doc*/
8491 0, /*tp_traverse*/
8492 0, /*tp_clear*/
8493 0, /*tp_richcompare*/
8494 0, /*tp_weaklistoffset*/
8495 0, /*tp_iter*/
8496 0, /*tp_iternext*/
8497 encoding_map_methods, /*tp_methods*/
8498 0, /*tp_members*/
8499 0, /*tp_getset*/
8500 0, /*tp_base*/
8501 0, /*tp_dict*/
8502 0, /*tp_descr_get*/
8503 0, /*tp_descr_set*/
8504 0, /*tp_dictoffset*/
8505 0, /*tp_init*/
8506 0, /*tp_alloc*/
8507 0, /*tp_new*/
8508 0, /*tp_free*/
8509 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008510};
8511
8512PyObject*
8513PyUnicode_BuildEncodingMap(PyObject* string)
8514{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008515 PyObject *result;
8516 struct encoding_map *mresult;
8517 int i;
8518 int need_dict = 0;
8519 unsigned char level1[32];
8520 unsigned char level2[512];
8521 unsigned char *mlevel1, *mlevel2, *mlevel3;
8522 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008524 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008525 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008527
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008528 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008529 PyErr_BadArgument();
8530 return NULL;
8531 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 kind = PyUnicode_KIND(string);
8533 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008534 length = PyUnicode_GET_LENGTH(string);
8535 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008536 memset(level1, 0xFF, sizeof level1);
8537 memset(level2, 0xFF, sizeof level2);
8538
8539 /* If there isn't a one-to-one mapping of NULL to \0,
8540 or if there are non-BMP characters, we need to use
8541 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008543 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008544 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008545 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 ch = PyUnicode_READ(kind, data, i);
8547 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008548 need_dict = 1;
8549 break;
8550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008552 /* unmapped character */
8553 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 l1 = ch >> 11;
8555 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008556 if (level1[l1] == 0xFF)
8557 level1[l1] = count2++;
8558 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008559 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008560 }
8561
8562 if (count2 >= 0xFF || count3 >= 0xFF)
8563 need_dict = 1;
8564
8565 if (need_dict) {
8566 PyObject *result = PyDict_New();
8567 PyObject *key, *value;
8568 if (!result)
8569 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008570 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008572 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008573 if (!key || !value)
8574 goto failed1;
8575 if (PyDict_SetItem(result, key, value) == -1)
8576 goto failed1;
8577 Py_DECREF(key);
8578 Py_DECREF(value);
8579 }
8580 return result;
8581 failed1:
8582 Py_XDECREF(key);
8583 Py_XDECREF(value);
8584 Py_DECREF(result);
8585 return NULL;
8586 }
8587
8588 /* Create a three-level trie */
Victor Stinner32bd68c2020-12-01 10:37:39 +01008589 result = PyObject_Malloc(sizeof(struct encoding_map) +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008590 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008591 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008592 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008593 }
8594
8595 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008596 mresult = (struct encoding_map*)result;
8597 mresult->count2 = count2;
8598 mresult->count3 = count3;
8599 mlevel1 = mresult->level1;
8600 mlevel2 = mresult->level23;
8601 mlevel3 = mresult->level23 + 16*count2;
8602 memcpy(mlevel1, level1, 32);
8603 memset(mlevel2, 0xFF, 16*count2);
8604 memset(mlevel3, 0, 128*count3);
8605 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008606 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008607 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008608 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8609 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008610 /* unmapped character */
8611 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008612 o1 = ch>>11;
8613 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008614 i2 = 16*mlevel1[o1] + o2;
8615 if (mlevel2[i2] == 0xFF)
8616 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008617 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008618 i3 = 128*mlevel2[i2] + o3;
8619 mlevel3[i3] = i;
8620 }
8621 return result;
8622}
8623
8624static int
Victor Stinner22168992011-11-20 17:09:18 +01008625encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008626{
8627 struct encoding_map *map = (struct encoding_map*)mapping;
8628 int l1 = c>>11;
8629 int l2 = (c>>7) & 0xF;
8630 int l3 = c & 0x7F;
8631 int i;
8632
Victor Stinner22168992011-11-20 17:09:18 +01008633 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008635 if (c == 0)
8636 return 0;
8637 /* level 1*/
8638 i = map->level1[l1];
8639 if (i == 0xFF) {
8640 return -1;
8641 }
8642 /* level 2*/
8643 i = map->level23[16*i+l2];
8644 if (i == 0xFF) {
8645 return -1;
8646 }
8647 /* level 3 */
8648 i = map->level23[16*map->count2 + 128*i + l3];
8649 if (i == 0) {
8650 return -1;
8651 }
8652 return i;
8653}
8654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655/* Lookup the character ch in the mapping. If the character
8656 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008657 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008659charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660{
Christian Heimes217cfd12007-12-02 14:31:20 +00008661 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662 PyObject *x;
8663
8664 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 x = PyObject_GetItem(mapping, w);
8667 Py_DECREF(w);
8668 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8670 /* No mapping found means: mapping is undefined. */
8671 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008672 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 } else
8674 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008676 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008678 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 long value = PyLong_AS_LONG(x);
8680 if (value < 0 || value > 255) {
8681 PyErr_SetString(PyExc_TypeError,
8682 "character mapping must be in range(256)");
8683 Py_DECREF(x);
8684 return NULL;
8685 }
8686 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008688 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 /* wrong return value */
8692 PyErr_Format(PyExc_TypeError,
8693 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008694 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 Py_DECREF(x);
8696 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 }
8698}
8699
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008700static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008701charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008702{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008703 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8704 /* exponentially overallocate to minimize reallocations */
8705 if (requiredsize < 2*outsize)
8706 requiredsize = 2*outsize;
8707 if (_PyBytes_Resize(outobj, requiredsize))
8708 return -1;
8709 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008710}
8711
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008716 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 space is available. Return a new reference to the object that
8718 was put in the output buffer, or Py_None, if the mapping was undefined
8719 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008720 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008721static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008722charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008723 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008725 PyObject *rep;
8726 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008727 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728
Andy Lesterdffe4c02020-03-04 07:15:20 -06008729 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008730 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008732 if (res == -1)
8733 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 if (outsize<requiredsize)
8735 if (charmapencode_resize(outobj, outpos, requiredsize))
8736 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008737 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 outstart[(*outpos)++] = (char)res;
8739 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008740 }
8741
8742 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008743 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008745 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 Py_DECREF(rep);
8747 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008748 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 if (PyLong_Check(rep)) {
8750 Py_ssize_t requiredsize = *outpos+1;
8751 if (outsize<requiredsize)
8752 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8753 Py_DECREF(rep);
8754 return enc_EXCEPTION;
8755 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008756 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008758 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 else {
8760 const char *repchars = PyBytes_AS_STRING(rep);
8761 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8762 Py_ssize_t requiredsize = *outpos+repsize;
8763 if (outsize<requiredsize)
8764 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8765 Py_DECREF(rep);
8766 return enc_EXCEPTION;
8767 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008768 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 memcpy(outstart + *outpos, repchars, repsize);
8770 *outpos += repsize;
8771 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008773 Py_DECREF(rep);
8774 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775}
8776
8777/* handle an error in PyUnicode_EncodeCharmap
8778 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008779static int
8780charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008781 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008782 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008783 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008784 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785{
8786 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008787 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008788 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008789 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008790 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008791 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008793 Py_ssize_t collstartpos = *inpos;
8794 Py_ssize_t collendpos = *inpos+1;
8795 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008796 const char *encoding = "charmap";
8797 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008798 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008799 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008800 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008801
Benjamin Petersonbac79492012-01-14 13:34:47 -05008802 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008803 return -1;
8804 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805 /* find all unencodable characters */
8806 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008807 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008808 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008809 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008810 val = encoding_map_lookup(ch, mapping);
8811 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 break;
8813 ++collendpos;
8814 continue;
8815 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008816
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008817 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8818 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008819 if (rep==NULL)
8820 return -1;
8821 else if (rep!=Py_None) {
8822 Py_DECREF(rep);
8823 break;
8824 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008825 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008827 }
8828 /* cache callback name lookup
8829 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008830 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008831 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008832
8833 switch (*error_handler) {
8834 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008835 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008836 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008837
8838 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008839 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 x = charmapencode_output('?', mapping, res, respos);
8841 if (x==enc_EXCEPTION) {
8842 return -1;
8843 }
8844 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008845 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008846 return -1;
8847 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008848 }
8849 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008850 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008851 *inpos = collendpos;
8852 break;
Victor Stinner50149202015-09-22 00:26:54 +02008853
8854 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008855 /* generate replacement (temporarily (mis)uses p) */
8856 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008857 char buffer[2+29+1+1];
8858 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008859 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008860 for (cp = buffer; *cp; ++cp) {
8861 x = charmapencode_output(*cp, mapping, res, respos);
8862 if (x==enc_EXCEPTION)
8863 return -1;
8864 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008865 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 return -1;
8867 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008868 }
8869 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008870 *inpos = collendpos;
8871 break;
Victor Stinner50149202015-09-22 00:26:54 +02008872
Benjamin Peterson14339b62009-01-31 16:36:08 +00008873 default:
Victor Stinner50149202015-09-22 00:26:54 +02008874 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008875 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008877 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008879 if (PyBytes_Check(repunicode)) {
8880 /* Directly copy bytes result to output. */
8881 Py_ssize_t outsize = PyBytes_Size(*res);
8882 Py_ssize_t requiredsize;
8883 repsize = PyBytes_Size(repunicode);
8884 requiredsize = *respos + repsize;
8885 if (requiredsize > outsize)
8886 /* Make room for all additional bytes. */
8887 if (charmapencode_resize(res, respos, requiredsize)) {
8888 Py_DECREF(repunicode);
8889 return -1;
8890 }
8891 memcpy(PyBytes_AsString(*res) + *respos,
8892 PyBytes_AsString(repunicode), repsize);
8893 *respos += repsize;
8894 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008895 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008896 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008897 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008898 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008899 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008900 Py_DECREF(repunicode);
8901 return -1;
8902 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008903 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008904 data = PyUnicode_DATA(repunicode);
8905 kind = PyUnicode_KIND(repunicode);
8906 for (index = 0; index < repsize; index++) {
8907 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8908 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008910 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 return -1;
8912 }
8913 else if (x==enc_FAILED) {
8914 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008915 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 return -1;
8917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008918 }
8919 *inpos = newpos;
8920 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008921 }
8922 return 0;
8923}
8924
Alexander Belopolsky40018472011-02-26 01:02:56 +00008925PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008926_PyUnicode_EncodeCharmap(PyObject *unicode,
8927 PyObject *mapping,
8928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008930 /* output object */
8931 PyObject *res = NULL;
8932 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008933 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008934 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008935 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008936 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008937 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008938 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008939 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008940 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008941 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942
Benjamin Petersonbac79492012-01-14 13:34:47 -05008943 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008944 return NULL;
8945 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008946 data = PyUnicode_DATA(unicode);
8947 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008948
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 /* Default to Latin-1 */
8950 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008951 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008953 /* allocate enough for a simple encoding without
8954 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008955 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008956 if (res == NULL)
8957 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008958 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008961 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008962 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008964 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 if (x==enc_EXCEPTION) /* error */
8966 goto onError;
8967 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008968 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008970 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 &res, &respos)) {
8972 goto onError;
8973 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008974 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 else
8976 /* done with this character => adjust input position */
8977 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008980 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008981 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008982 if (_PyBytes_Resize(&res, respos) < 0)
8983 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008984
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008985 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008986 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008987 return res;
8988
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008990 Py_XDECREF(res);
8991 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008992 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 return NULL;
8994}
8995
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008996/* Deprecated */
8997PyObject *
8998PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8999 Py_ssize_t size,
9000 PyObject *mapping,
9001 const char *errors)
9002{
9003 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009004 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009005 if (unicode == NULL)
9006 return NULL;
9007 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
9008 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01009009 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009010}
9011
Alexander Belopolsky40018472011-02-26 01:02:56 +00009012PyObject *
9013PyUnicode_AsCharmapString(PyObject *unicode,
9014 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015{
9016 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 PyErr_BadArgument();
9018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009020 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021}
9022
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009023/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009024static void
9025make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009027 Py_ssize_t startpos, Py_ssize_t endpos,
9028 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009030 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 *exceptionObject = _PyUnicodeTranslateError_Create(
9032 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 }
9034 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9036 goto onError;
9037 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9038 goto onError;
9039 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9040 goto onError;
9041 return;
9042 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02009043 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 }
9045}
9046
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009047/* error handling callback helper:
9048 build arguments, call the callback and check the arguments,
9049 put the result into newpos and return the replacement string, which
9050 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009051static PyObject *
9052unicode_translate_call_errorhandler(const char *errors,
9053 PyObject **errorHandler,
9054 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009056 Py_ssize_t startpos, Py_ssize_t endpos,
9057 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009058{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009059 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009061 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009062 PyObject *restuple;
9063 PyObject *resunicode;
9064
9065 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009066 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009067 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009069 }
9070
9071 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009073 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009075
Petr Viktorinffd97532020-02-11 17:46:57 +01009076 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009077 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009078 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009079 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009080 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 Py_DECREF(restuple);
9082 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009083 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009084 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 &resunicode, &i_newpos)) {
9086 Py_DECREF(restuple);
9087 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009088 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009089 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009091 else
9092 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009094 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009095 Py_DECREF(restuple);
9096 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009097 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009098 Py_INCREF(resunicode);
9099 Py_DECREF(restuple);
9100 return resunicode;
9101}
9102
9103/* Lookup the character ch in the mapping and put the result in result,
9104 which must be decrefed by the caller.
9105 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009106static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009108{
Christian Heimes217cfd12007-12-02 14:31:20 +00009109 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009110 PyObject *x;
9111
9112 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009114 x = PyObject_GetItem(mapping, w);
9115 Py_DECREF(w);
9116 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9118 /* No mapping found means: use 1:1 mapping. */
9119 PyErr_Clear();
9120 *result = NULL;
9121 return 0;
9122 } else
9123 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009124 }
9125 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 *result = x;
9127 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009128 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009129 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009131 if (value < 0 || value > MAX_UNICODE) {
9132 PyErr_Format(PyExc_ValueError,
9133 "character mapping must be in range(0x%x)",
9134 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 Py_DECREF(x);
9136 return -1;
9137 }
9138 *result = x;
9139 return 0;
9140 }
9141 else if (PyUnicode_Check(x)) {
9142 *result = x;
9143 return 0;
9144 }
9145 else {
9146 /* wrong return value */
9147 PyErr_SetString(PyExc_TypeError,
9148 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009149 Py_DECREF(x);
9150 return -1;
9151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009152}
Victor Stinner1194ea02014-04-04 19:37:40 +02009153
9154/* lookup the character, write the result into the writer.
9155 Return 1 if the result was written into the writer, return 0 if the mapping
9156 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009157static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009158charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9159 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009160{
Victor Stinner1194ea02014-04-04 19:37:40 +02009161 PyObject *item;
9162
9163 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009165
9166 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009168 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009170 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009171 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009172 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009173
9174 if (item == Py_None) {
9175 Py_DECREF(item);
9176 return 0;
9177 }
9178
9179 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009180 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9181 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9182 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009183 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9184 Py_DECREF(item);
9185 return -1;
9186 }
9187 Py_DECREF(item);
9188 return 1;
9189 }
9190
9191 if (!PyUnicode_Check(item)) {
9192 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009194 }
9195
9196 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9197 Py_DECREF(item);
9198 return -1;
9199 }
9200
9201 Py_DECREF(item);
9202 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009203}
9204
Victor Stinner89a76ab2014-04-05 11:44:04 +02009205static int
9206unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9207 Py_UCS1 *translate)
9208{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009209 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009210 int ret = 0;
9211
Victor Stinner89a76ab2014-04-05 11:44:04 +02009212 if (charmaptranslate_lookup(ch, mapping, &item)) {
9213 return -1;
9214 }
9215
9216 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009217 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009218 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009219 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009220 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009221 /* not found => default to 1:1 mapping */
9222 translate[ch] = ch;
9223 return 1;
9224 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009225 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009226 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009227 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9228 used it */
9229 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009230 /* invalid character or character outside ASCII:
9231 skip the fast translate */
9232 goto exit;
9233 }
9234 translate[ch] = (Py_UCS1)replace;
9235 }
9236 else if (PyUnicode_Check(item)) {
9237 Py_UCS4 replace;
9238
9239 if (PyUnicode_READY(item) == -1) {
9240 Py_DECREF(item);
9241 return -1;
9242 }
9243 if (PyUnicode_GET_LENGTH(item) != 1)
9244 goto exit;
9245
9246 replace = PyUnicode_READ_CHAR(item, 0);
9247 if (replace > 127)
9248 goto exit;
9249 translate[ch] = (Py_UCS1)replace;
9250 }
9251 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009252 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009253 goto exit;
9254 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009255 ret = 1;
9256
Benjamin Peterson1365de72014-04-07 20:15:41 -04009257 exit:
9258 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009259 return ret;
9260}
9261
9262/* Fast path for ascii => ascii translation. Return 1 if the whole string
9263 was translated into writer, return 0 if the input string was partially
9264 translated into writer, raise an exception and return -1 on error. */
9265static int
9266unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009267 _PyUnicodeWriter *writer, int ignore,
9268 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009269{
Victor Stinner872b2912014-04-05 14:27:07 +02009270 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009271 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009272 const Py_UCS1 *in, *end;
9273 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009274 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009275
Victor Stinner89a76ab2014-04-05 11:44:04 +02009276 len = PyUnicode_GET_LENGTH(input);
9277
Victor Stinner872b2912014-04-05 14:27:07 +02009278 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009279
9280 in = PyUnicode_1BYTE_DATA(input);
9281 end = in + len;
9282
9283 assert(PyUnicode_IS_ASCII(writer->buffer));
9284 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9285 out = PyUnicode_1BYTE_DATA(writer->buffer);
9286
Victor Stinner872b2912014-04-05 14:27:07 +02009287 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009288 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009289 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009290 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009291 int translate = unicode_fast_translate_lookup(mapping, ch,
9292 ascii_table);
9293 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009294 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009295 if (translate == 0)
9296 goto exit;
9297 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009298 }
Victor Stinner872b2912014-04-05 14:27:07 +02009299 if (ch2 == 0xfe) {
9300 if (ignore)
9301 continue;
9302 goto exit;
9303 }
9304 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009305 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009306 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009307 }
Victor Stinner872b2912014-04-05 14:27:07 +02009308 res = 1;
9309
9310exit:
9311 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009312 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009313 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009314}
9315
Victor Stinner3222da22015-10-01 22:07:32 +02009316static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317_PyUnicode_TranslateCharmap(PyObject *input,
9318 PyObject *mapping,
9319 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009322 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 Py_ssize_t size, i;
9324 int kind;
9325 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009326 _PyUnicodeWriter writer;
9327 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009328 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009329 PyObject *errorHandler = NULL;
9330 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009331 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009332 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009333
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009335 PyErr_BadArgument();
9336 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 if (PyUnicode_READY(input) == -1)
9340 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009341 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 kind = PyUnicode_KIND(input);
9343 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009345 if (size == 0)
9346 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009348 /* allocate enough for a simple 1:1 translation without
9349 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009350 _PyUnicodeWriter_Init(&writer);
9351 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009352 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353
Victor Stinner872b2912014-04-05 14:27:07 +02009354 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9355
Victor Stinner33798672016-03-01 21:59:58 +01009356 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009357 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009358 if (PyUnicode_IS_ASCII(input)) {
9359 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9360 if (res < 0) {
9361 _PyUnicodeWriter_Dealloc(&writer);
9362 return NULL;
9363 }
9364 if (res == 1)
9365 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009366 }
Victor Stinner33798672016-03-01 21:59:58 +01009367 else {
9368 i = 0;
9369 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009373 int translate;
9374 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9375 Py_ssize_t newpos;
9376 /* startpos for collecting untranslatable chars */
9377 Py_ssize_t collstart;
9378 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009379 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380
Victor Stinner1194ea02014-04-04 19:37:40 +02009381 ch = PyUnicode_READ(kind, data, i);
9382 translate = charmaptranslate_output(ch, mapping, &writer);
9383 if (translate < 0)
9384 goto onError;
9385
9386 if (translate != 0) {
9387 /* it worked => adjust input pointer */
9388 ++i;
9389 continue;
9390 }
9391
9392 /* untranslatable character */
9393 collstart = i;
9394 collend = i+1;
9395
9396 /* find all untranslatable characters */
9397 while (collend < size) {
9398 PyObject *x;
9399 ch = PyUnicode_READ(kind, data, collend);
9400 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009401 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009402 Py_XDECREF(x);
9403 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009404 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009405 ++collend;
9406 }
9407
9408 if (ignore) {
9409 i = collend;
9410 }
9411 else {
9412 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9413 reason, input, &exc,
9414 collstart, collend, &newpos);
9415 if (repunicode == NULL)
9416 goto onError;
9417 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009419 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009420 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009421 Py_DECREF(repunicode);
9422 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009423 }
9424 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009425 Py_XDECREF(exc);
9426 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009427 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009430 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009431 Py_XDECREF(exc);
9432 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009433 return NULL;
9434}
9435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436/* Deprecated. Use PyUnicode_Translate instead. */
9437PyObject *
9438PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9439 Py_ssize_t size,
9440 PyObject *mapping,
9441 const char *errors)
9442{
Christian Heimes5f520f42012-09-11 14:03:25 +02009443 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009444 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 if (!unicode)
9446 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009447 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9448 Py_DECREF(unicode);
9449 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450}
9451
Alexander Belopolsky40018472011-02-26 01:02:56 +00009452PyObject *
9453PyUnicode_Translate(PyObject *str,
9454 PyObject *mapping,
9455 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009457 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009458 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009459 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460}
Tim Petersced69f82003-09-16 20:30:58 +00009461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462PyObject *
9463_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9464{
9465 if (!PyUnicode_Check(unicode)) {
9466 PyErr_BadInternalCall();
9467 return NULL;
9468 }
9469 if (PyUnicode_READY(unicode) == -1)
9470 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009471 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 /* If the string is already ASCII, just return the same string */
9473 Py_INCREF(unicode);
9474 return unicode;
9475 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009476
9477 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9478 PyObject *result = PyUnicode_New(len, 127);
9479 if (result == NULL) {
9480 return NULL;
9481 }
9482
9483 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9484 int kind = PyUnicode_KIND(unicode);
9485 const void *data = PyUnicode_DATA(unicode);
9486 Py_ssize_t i;
9487 for (i = 0; i < len; ++i) {
9488 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9489 if (ch < 127) {
9490 out[i] = ch;
9491 }
9492 else if (Py_UNICODE_ISSPACE(ch)) {
9493 out[i] = ' ';
9494 }
9495 else {
9496 int decimal = Py_UNICODE_TODECIMAL(ch);
9497 if (decimal < 0) {
9498 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009499 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009500 _PyUnicode_LENGTH(result) = i + 1;
9501 break;
9502 }
9503 out[i] = '0' + decimal;
9504 }
9505 }
9506
INADA Naoki16dfca42018-07-14 12:06:43 +09009507 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009508 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509}
9510
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009511PyObject *
9512PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9513 Py_ssize_t length)
9514{
Victor Stinnerf0124502011-11-21 23:12:56 +01009515 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009516 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009517 Py_UCS4 maxchar;
9518 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009519 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009520
Victor Stinner99d7ad02012-02-22 13:37:39 +01009521 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009522 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009523 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009524 if (ch > 127) {
9525 int decimal = Py_UNICODE_TODECIMAL(ch);
9526 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009527 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009528 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009529 }
9530 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009531
9532 /* Copy to a new string */
9533 decimal = PyUnicode_New(length, maxchar);
9534 if (decimal == NULL)
9535 return decimal;
9536 kind = PyUnicode_KIND(decimal);
9537 data = PyUnicode_DATA(decimal);
9538 /* Iterate over code points */
9539 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009540 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009541 if (ch > 127) {
9542 int decimal = Py_UNICODE_TODECIMAL(ch);
9543 if (decimal >= 0)
9544 ch = '0' + decimal;
9545 }
9546 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009548 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009549}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009550/* --- Decimal Encoder ---------------------------------------------------- */
9551
Alexander Belopolsky40018472011-02-26 01:02:56 +00009552int
9553PyUnicode_EncodeDecimal(Py_UNICODE *s,
9554 Py_ssize_t length,
9555 char *output,
9556 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009557{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009558 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009559 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009560 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009561 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009562
9563 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009564 PyErr_BadArgument();
9565 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009566 }
9567
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009568 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009569 if (unicode == NULL)
9570 return -1;
9571
Victor Stinner42bf7752011-11-21 22:52:58 +01009572 kind = PyUnicode_KIND(unicode);
9573 data = PyUnicode_DATA(unicode);
9574
Victor Stinnerb84d7232011-11-22 01:50:07 +01009575 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009576 PyObject *exc;
9577 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009579 Py_ssize_t startpos;
9580
9581 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009582
Benjamin Peterson29060642009-01-31 22:14:21 +00009583 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009584 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009585 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009587 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009588 decimal = Py_UNICODE_TODECIMAL(ch);
9589 if (decimal >= 0) {
9590 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009591 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009592 continue;
9593 }
9594 if (0 < ch && ch < 256) {
9595 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009596 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009597 continue;
9598 }
Victor Stinner6345be92011-11-25 20:09:01 +01009599
Victor Stinner42bf7752011-11-21 22:52:58 +01009600 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009601 exc = NULL;
9602 raise_encode_exception(&exc, "decimal", unicode,
9603 startpos, startpos+1,
9604 "invalid decimal Unicode string");
9605 Py_XDECREF(exc);
9606 Py_DECREF(unicode);
9607 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009608 }
9609 /* 0-terminate the output string */
9610 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009611 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009612 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009613}
9614
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615/* --- Helpers ------------------------------------------------------------ */
9616
/* Helper macro to fix up start/end slice values against a length.
 *
 * `start` and `end` must be modifiable lvalues; `len` is only read (and may
 * be evaluated more than once, so it must be free of side effects).
 *
 *   - end > len          -> end = len
 *   - end < 0            -> end += len, then clamped to 0
 *   - start < 0          -> start += len, then clamped to 0
 *   - start > len is left untouched; callers detect it via `end - start`.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement and can be used safely in an unbraced if/else.
 * Always terminate the invocation with a semicolon.
 */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009633any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009635 Py_ssize_t end,
9636 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009638 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009639 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 Py_ssize_t len1, len2, result;
9641
9642 kind1 = PyUnicode_KIND(s1);
9643 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009644 if (kind1 < kind2)
9645 return -1;
9646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 len1 = PyUnicode_GET_LENGTH(s1);
9648 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009649 ADJUST_INDICES(start, end, len1);
9650 if (end - start < len2)
9651 return -1;
9652
9653 buf1 = PyUnicode_DATA(s1);
9654 buf2 = PyUnicode_DATA(s2);
9655 if (len2 == 1) {
9656 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9657 result = findchar((const char *)buf1 + kind1*start,
9658 kind1, end - start, ch, direction);
9659 if (result == -1)
9660 return -1;
9661 else
9662 return start + result;
9663 }
9664
9665 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009666 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009667 if (!buf2)
9668 return -2;
9669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670
Victor Stinner794d5672011-10-10 03:21:36 +02009671 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009672 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009673 case PyUnicode_1BYTE_KIND:
9674 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9675 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9676 else
9677 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9678 break;
9679 case PyUnicode_2BYTE_KIND:
9680 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9681 break;
9682 case PyUnicode_4BYTE_KIND:
9683 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9684 break;
9685 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009686 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009687 }
9688 }
9689 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009690 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009691 case PyUnicode_1BYTE_KIND:
9692 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9693 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9694 else
9695 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9696 break;
9697 case PyUnicode_2BYTE_KIND:
9698 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9699 break;
9700 case PyUnicode_4BYTE_KIND:
9701 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9702 break;
9703 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009704 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 }
9707
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009708 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009709 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009710 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711
9712 return result;
9713}
9714
Victor Stinner59423e32018-11-26 13:40:01 +01009715/* _PyUnicode_InsertThousandsGrouping() helper functions */
9716#include "stringlib/localeutil.h"
9717
9718/**
9719 * InsertThousandsGrouping:
9720 * @writer: Unicode writer.
9721 * @n_buffer: Number of characters in @buffer.
9722 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9723 * @d_pos: Start of digits string.
9724 * @n_digits: The number of digits in the string, in which we want
9725 * to put the grouping chars.
9726 * @min_width: The minimum width of the digits in the output string.
9727 * Output will be zero-padded on the left to fill.
9728 * @grouping: see definition in localeconv().
9729 * @thousands_sep: see definition in localeconv().
9730 *
9731 * There are 2 modes: counting and filling. If @writer is NULL,
9732 * we are in counting mode, else filling mode.
9733 * If counting, the required buffer size is returned.
9734 * If filling, we know the buffer will be large enough, so we don't
9735 * need to pass in the buffer size.
9736 * Inserts thousand grouping characters (as defined by grouping and
9737 * thousands_sep) into @writer.
9738 *
9739 * Return value: -1 on error, number of characters otherwise.
9740 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009742_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009743 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009744 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009745 PyObject *digits,
9746 Py_ssize_t d_pos,
9747 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009748 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009749 const char *grouping,
9750 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009751 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752{
Xtreak3f7983a2019-01-07 20:39:14 +05309753 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009754 if (writer) {
9755 assert(digits != NULL);
9756 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009757 }
9758 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009759 assert(digits == NULL);
9760 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009761 }
Victor Stinner59423e32018-11-26 13:40:01 +01009762 assert(0 <= d_pos);
9763 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009764 assert(grouping != NULL);
9765
9766 if (digits != NULL) {
9767 if (PyUnicode_READY(digits) == -1) {
9768 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009769 }
Victor Stinner59423e32018-11-26 13:40:01 +01009770 }
9771 if (PyUnicode_READY(thousands_sep) == -1) {
9772 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009773 }
9774
Victor Stinner59423e32018-11-26 13:40:01 +01009775 Py_ssize_t count = 0;
9776 Py_ssize_t n_zeros;
9777 int loop_broken = 0;
9778 int use_separator = 0; /* First time through, don't append the
9779 separator. They only go between
9780 groups. */
9781 Py_ssize_t buffer_pos;
9782 Py_ssize_t digits_pos;
9783 Py_ssize_t len;
9784 Py_ssize_t n_chars;
9785 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9786 be looked at */
9787 /* A generator that returns all of the grouping widths, until it
9788 returns 0. */
9789 GroupGenerator groupgen;
9790 GroupGenerator_init(&groupgen, grouping);
9791 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9792
9793 /* if digits are not grouped, thousands separator
9794 should be an empty string */
9795 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9796
9797 digits_pos = d_pos + n_digits;
9798 if (writer) {
9799 buffer_pos = writer->pos + n_buffer;
9800 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9801 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 }
Victor Stinner59423e32018-11-26 13:40:01 +01009803 else {
9804 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009805 }
Victor Stinner59423e32018-11-26 13:40:01 +01009806
9807 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009808 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009809 }
Victor Stinner59423e32018-11-26 13:40:01 +01009810
9811 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9812 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9813 n_zeros = Py_MAX(0, len - remaining);
9814 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9815
9816 /* Use n_zero zero's and n_chars chars */
9817
9818 /* Count only, don't do anything. */
9819 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9820
9821 /* Copy into the writer. */
9822 InsertThousandsGrouping_fill(writer, &buffer_pos,
9823 digits, &digits_pos,
9824 n_chars, n_zeros,
9825 use_separator ? thousands_sep : NULL,
9826 thousands_sep_len, maxchar);
9827
9828 /* Use a separator next time. */
9829 use_separator = 1;
9830
9831 remaining -= n_chars;
9832 min_width -= len;
9833
9834 if (remaining <= 0 && min_width <= 0) {
9835 loop_broken = 1;
9836 break;
9837 }
9838 min_width -= thousands_sep_len;
9839 }
9840 if (!loop_broken) {
9841 /* We left the loop without using a break statement. */
9842
9843 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9844 n_zeros = Py_MAX(0, len - remaining);
9845 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9846
9847 /* Use n_zero zero's and n_chars chars */
9848 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9849
9850 /* Copy into the writer. */
9851 InsertThousandsGrouping_fill(writer, &buffer_pos,
9852 digits, &digits_pos,
9853 n_chars, n_zeros,
9854 use_separator ? thousands_sep : NULL,
9855 thousands_sep_len, maxchar);
9856 }
9857 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858}
9859
9860
Alexander Belopolsky40018472011-02-26 01:02:56 +00009861Py_ssize_t
9862PyUnicode_Count(PyObject *str,
9863 PyObject *substr,
9864 Py_ssize_t start,
9865 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009867 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009868 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009869 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009871
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009872 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009873 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009874
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009875 kind1 = PyUnicode_KIND(str);
9876 kind2 = PyUnicode_KIND(substr);
9877 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009878 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009879
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009880 len1 = PyUnicode_GET_LENGTH(str);
9881 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009883 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009884 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009885
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009886 buf1 = PyUnicode_DATA(str);
9887 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009888 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009889 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009890 if (!buf2)
9891 goto onError;
9892 }
9893
9894 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009896 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009897 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009898 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009899 buf2, len2, PY_SSIZE_T_MAX
9900 );
9901 else
9902 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009903 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009904 buf2, len2, PY_SSIZE_T_MAX
9905 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 break;
9907 case PyUnicode_2BYTE_KIND:
9908 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009909 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 buf2, len2, PY_SSIZE_T_MAX
9911 );
9912 break;
9913 case PyUnicode_4BYTE_KIND:
9914 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009915 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 buf2, len2, PY_SSIZE_T_MAX
9917 );
9918 break;
9919 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009920 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009922
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009923 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009924 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009925 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009929 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9930 if (kind2 != kind1)
9931 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933}
9934
Alexander Belopolsky40018472011-02-26 01:02:56 +00009935Py_ssize_t
9936PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009937 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009938 Py_ssize_t start,
9939 Py_ssize_t end,
9940 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009942 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009944
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009945 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946}
9947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948Py_ssize_t
9949PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9950 Py_ssize_t start, Py_ssize_t end,
9951 int direction)
9952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009954 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 if (PyUnicode_READY(str) == -1)
9956 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009957 len = PyUnicode_GET_LENGTH(str);
9958 ADJUST_INDICES(start, end, len);
9959 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009960 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009962 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9963 kind, end-start, ch, direction);
9964 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009966 else
9967 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968}
9969
Alexander Belopolsky40018472011-02-26 01:02:56 +00009970static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009971tailmatch(PyObject *self,
9972 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009973 Py_ssize_t start,
9974 Py_ssize_t end,
9975 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 int kind_self;
9978 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009979 const void *data_self;
9980 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 Py_ssize_t offset;
9982 Py_ssize_t i;
9983 Py_ssize_t end_sub;
9984
9985 if (PyUnicode_READY(self) == -1 ||
9986 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009987 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9990 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009992 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009994 if (PyUnicode_GET_LENGTH(substring) == 0)
9995 return 1;
9996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 kind_self = PyUnicode_KIND(self);
9998 data_self = PyUnicode_DATA(self);
9999 kind_sub = PyUnicode_KIND(substring);
10000 data_sub = PyUnicode_DATA(substring);
10001 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
10002
10003 if (direction > 0)
10004 offset = end;
10005 else
10006 offset = start;
10007
10008 if (PyUnicode_READ(kind_self, data_self, offset) ==
10009 PyUnicode_READ(kind_sub, data_sub, 0) &&
10010 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
10011 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
10012 /* If both are of the same kind, memcmp is sufficient */
10013 if (kind_self == kind_sub) {
10014 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010015 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 data_sub,
10017 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010018 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 }
Martin Pantere26da7c2016-06-02 10:07:09 +000010020 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 else {
10022 /* We do not need to compare 0 and len(substring)-1 because
10023 the if statement above ensured already that they are equal
10024 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 for (i = 1; i < end_sub; ++i) {
10026 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10027 PyUnicode_READ(kind_sub, data_sub, i))
10028 return 0;
10029 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 }
10033
10034 return 0;
10035}
10036
Alexander Belopolsky40018472011-02-26 01:02:56 +000010037Py_ssize_t
10038PyUnicode_Tailmatch(PyObject *str,
10039 PyObject *substr,
10040 Py_ssize_t start,
10041 Py_ssize_t end,
10042 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010044 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010045 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010046
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010047 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010048}
10049
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010050static PyObject *
10051ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010053 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010054 const char *data = PyUnicode_DATA(self);
10055 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010056 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010057
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010058 res = PyUnicode_New(len, 127);
10059 if (res == NULL)
10060 return NULL;
10061 resdata = PyUnicode_DATA(res);
10062 if (lower)
10063 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010065 _Py_bytes_upper(resdata, data, len);
10066 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067}
10068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010070handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010071{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010072 Py_ssize_t j;
10073 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010074 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010075 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010076
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010077 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10078
10079 where ! is a negation and \p{xxx} is a character with property xxx.
10080 */
10081 for (j = i - 1; j >= 0; j--) {
10082 c = PyUnicode_READ(kind, data, j);
10083 if (!_PyUnicode_IsCaseIgnorable(c))
10084 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010086 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10087 if (final_sigma) {
10088 for (j = i + 1; j < length; j++) {
10089 c = PyUnicode_READ(kind, data, j);
10090 if (!_PyUnicode_IsCaseIgnorable(c))
10091 break;
10092 }
10093 final_sigma = j == length || !_PyUnicode_IsCased(c);
10094 }
10095 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096}
10097
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010098static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010099lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010100 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010101{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010102 /* Obscure special case. */
10103 if (c == 0x3A3) {
10104 mapped[0] = handle_capital_sigma(kind, data, length, i);
10105 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010107 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108}
10109
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010110static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010111do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010113 Py_ssize_t i, k = 0;
10114 int n_res, j;
10115 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010116
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010117 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010118 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010119 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010120 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010121 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010122 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010123 for (i = 1; i < length; i++) {
10124 c = PyUnicode_READ(kind, data, i);
10125 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10126 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010127 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010128 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010129 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010130 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010131 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132}
10133
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010134static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010135do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010136 Py_ssize_t i, k = 0;
10137
10138 for (i = 0; i < length; i++) {
10139 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10140 int n_res, j;
10141 if (Py_UNICODE_ISUPPER(c)) {
10142 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10143 }
10144 else if (Py_UNICODE_ISLOWER(c)) {
10145 n_res = _PyUnicode_ToUpperFull(c, mapped);
10146 }
10147 else {
10148 n_res = 1;
10149 mapped[0] = c;
10150 }
10151 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010152 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010153 res[k++] = mapped[j];
10154 }
10155 }
10156 return k;
10157}
10158
10159static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010160do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010161 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010163 Py_ssize_t i, k = 0;
10164
10165 for (i = 0; i < length; i++) {
10166 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10167 int n_res, j;
10168 if (lower)
10169 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10170 else
10171 n_res = _PyUnicode_ToUpperFull(c, mapped);
10172 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010173 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010174 res[k++] = mapped[j];
10175 }
10176 }
10177 return k;
10178}
10179
10180static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010181do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010182{
10183 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10184}
10185
10186static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010187do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010188{
10189 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10190}
10191
Benjamin Petersone51757f2012-01-12 21:10:29 -050010192static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010193do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010194{
10195 Py_ssize_t i, k = 0;
10196
10197 for (i = 0; i < length; i++) {
10198 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10199 Py_UCS4 mapped[3];
10200 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10201 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010202 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010203 res[k++] = mapped[j];
10204 }
10205 }
10206 return k;
10207}
10208
10209static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010210do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010211{
10212 Py_ssize_t i, k = 0;
10213 int previous_is_cased;
10214
10215 previous_is_cased = 0;
10216 for (i = 0; i < length; i++) {
10217 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10218 Py_UCS4 mapped[3];
10219 int n_res, j;
10220
10221 if (previous_is_cased)
10222 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10223 else
10224 n_res = _PyUnicode_ToTitleFull(c, mapped);
10225
10226 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010227 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010228 res[k++] = mapped[j];
10229 }
10230
10231 previous_is_cased = _PyUnicode_IsCased(c);
10232 }
10233 return k;
10234}
10235
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010236static PyObject *
10237case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010238 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010239{
10240 PyObject *res = NULL;
10241 Py_ssize_t length, newlength = 0;
10242 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010243 const void *data;
10244 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010245 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10246
Benjamin Petersoneea48462012-01-16 14:28:50 -050010247 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010248
10249 kind = PyUnicode_KIND(self);
10250 data = PyUnicode_DATA(self);
10251 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010252 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010253 PyErr_SetString(PyExc_OverflowError, "string is too long");
10254 return NULL;
10255 }
Victor Stinner00d7abd2020-12-01 09:56:42 +010010256 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010257 if (tmp == NULL)
10258 return PyErr_NoMemory();
10259 newlength = perform(kind, data, length, tmp, &maxchar);
10260 res = PyUnicode_New(newlength, maxchar);
10261 if (res == NULL)
10262 goto leave;
10263 tmpend = tmp + newlength;
10264 outdata = PyUnicode_DATA(res);
10265 outkind = PyUnicode_KIND(res);
10266 switch (outkind) {
10267 case PyUnicode_1BYTE_KIND:
10268 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10269 break;
10270 case PyUnicode_2BYTE_KIND:
10271 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10272 break;
10273 case PyUnicode_4BYTE_KIND:
10274 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10275 break;
10276 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010277 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010278 }
10279 leave:
Victor Stinner00d7abd2020-12-01 09:56:42 +010010280 PyMem_Free(tmp);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010281 return res;
10282}
10283
Tim Peters8ce9f162004-08-27 01:49:32 +000010284PyObject *
10285PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010287 PyObject *res;
10288 PyObject *fseq;
10289 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010290 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010292 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010293 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010294 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010295 }
10296
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010297 /* NOTE: the following code can't call back into Python code,
10298 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010299 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010300
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010301 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010302 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010303 res = _PyUnicode_JoinArray(separator, items, seqlen);
10304 Py_DECREF(fseq);
10305 return res;
10306}
10307
10308PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010309_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010310{
10311 PyObject *res = NULL; /* the result */
10312 PyObject *sep = NULL;
10313 Py_ssize_t seplen;
10314 PyObject *item;
10315 Py_ssize_t sz, i, res_offset;
10316 Py_UCS4 maxchar;
10317 Py_UCS4 item_maxchar;
10318 int use_memcpy;
10319 unsigned char *res_data = NULL, *sep_data = NULL;
10320 PyObject *last_obj;
10321 unsigned int kind = 0;
10322
Tim Peters05eba1f2004-08-27 21:32:02 +000010323 /* If empty sequence, return u"". */
10324 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010325 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010326 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010327
Tim Peters05eba1f2004-08-27 21:32:02 +000010328 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010329 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010330 if (seqlen == 1) {
10331 if (PyUnicode_CheckExact(items[0])) {
10332 res = items[0];
10333 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010334 return res;
10335 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010336 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010337 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010338 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010339 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010340 /* Set up sep and seplen */
10341 if (separator == NULL) {
10342 /* fall back to a blank space separator */
10343 sep = PyUnicode_FromOrdinal(' ');
10344 if (!sep)
10345 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010346 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010347 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010348 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010349 else {
10350 if (!PyUnicode_Check(separator)) {
10351 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010352 "separator: expected str instance,"
10353 " %.80s found",
10354 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010355 goto onError;
10356 }
10357 if (PyUnicode_READY(separator))
10358 goto onError;
10359 sep = separator;
10360 seplen = PyUnicode_GET_LENGTH(separator);
10361 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10362 /* inc refcount to keep this code path symmetric with the
10363 above case of a blank separator */
10364 Py_INCREF(sep);
10365 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010366 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010367 }
10368
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010369 /* There are at least two things to join, or else we have a subclass
10370 * of str in the sequence.
10371 * Do a pre-pass to figure out the total amount of space we'll
10372 * need (sz), and see whether all argument are strings.
10373 */
10374 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010375#ifdef Py_DEBUG
10376 use_memcpy = 0;
10377#else
10378 use_memcpy = 1;
10379#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010380 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010381 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010382 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010383 if (!PyUnicode_Check(item)) {
10384 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010385 "sequence item %zd: expected str instance,"
10386 " %.80s found",
10387 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010388 goto onError;
10389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 if (PyUnicode_READY(item) == -1)
10391 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010392 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010394 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010395 if (i != 0) {
10396 add_sz += seplen;
10397 }
10398 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010399 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010400 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010401 goto onError;
10402 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010403 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010404 if (use_memcpy && last_obj != NULL) {
10405 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10406 use_memcpy = 0;
10407 }
10408 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010409 }
Tim Petersced69f82003-09-16 20:30:58 +000010410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010412 if (res == NULL)
10413 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010414
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010415 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010416#ifdef Py_DEBUG
10417 use_memcpy = 0;
10418#else
10419 if (use_memcpy) {
10420 res_data = PyUnicode_1BYTE_DATA(res);
10421 kind = PyUnicode_KIND(res);
10422 if (seplen != 0)
10423 sep_data = PyUnicode_1BYTE_DATA(sep);
10424 }
10425#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010426 if (use_memcpy) {
10427 for (i = 0; i < seqlen; ++i) {
10428 Py_ssize_t itemlen;
10429 item = items[i];
10430
10431 /* Copy item, and maybe the separator. */
10432 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010433 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010434 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010435 kind * seplen);
10436 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010437 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010438
10439 itemlen = PyUnicode_GET_LENGTH(item);
10440 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010441 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010442 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010443 kind * itemlen);
10444 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010445 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010446 }
10447 assert(res_data == PyUnicode_1BYTE_DATA(res)
10448 + kind * PyUnicode_GET_LENGTH(res));
10449 }
10450 else {
10451 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10452 Py_ssize_t itemlen;
10453 item = items[i];
10454
10455 /* Copy item, and maybe the separator. */
10456 if (i && seplen != 0) {
10457 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10458 res_offset += seplen;
10459 }
10460
10461 itemlen = PyUnicode_GET_LENGTH(item);
10462 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010463 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010464 res_offset += itemlen;
10465 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010466 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010467 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010468 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010471 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473
Benjamin Peterson29060642009-01-31 22:14:21 +000010474 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010476 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477 return NULL;
10478}
10479
Victor Stinnerd3f08822012-05-29 12:57:52 +020010480void
10481_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10482 Py_UCS4 fill_char)
10483{
10484 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010485 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010486 assert(PyUnicode_IS_READY(unicode));
10487 assert(unicode_modifiable(unicode));
10488 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10489 assert(start >= 0);
10490 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010491 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010492}
10493
Victor Stinner3fe55312012-01-04 00:33:50 +010010494Py_ssize_t
10495PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10496 Py_UCS4 fill_char)
10497{
10498 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010499
10500 if (!PyUnicode_Check(unicode)) {
10501 PyErr_BadInternalCall();
10502 return -1;
10503 }
10504 if (PyUnicode_READY(unicode) == -1)
10505 return -1;
10506 if (unicode_check_modifiable(unicode))
10507 return -1;
10508
Victor Stinnerd3f08822012-05-29 12:57:52 +020010509 if (start < 0) {
10510 PyErr_SetString(PyExc_IndexError, "string index out of range");
10511 return -1;
10512 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010513 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10514 PyErr_SetString(PyExc_ValueError,
10515 "fill character is bigger than "
10516 "the string maximum character");
10517 return -1;
10518 }
10519
10520 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10521 length = Py_MIN(maxlen, length);
10522 if (length <= 0)
10523 return 0;
10524
Victor Stinnerd3f08822012-05-29 12:57:52 +020010525 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010526 return length;
10527}
10528
Victor Stinner9310abb2011-10-05 00:59:23 +020010529static PyObject *
10530pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010531 Py_ssize_t left,
10532 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 PyObject *u;
10536 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010537 int kind;
10538 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539
10540 if (left < 0)
10541 left = 0;
10542 if (right < 0)
10543 right = 0;
10544
Victor Stinnerc4b49542011-12-11 22:44:26 +010010545 if (left == 0 && right == 0)
10546 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10549 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010550 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10551 return NULL;
10552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010554 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010556 if (!u)
10557 return NULL;
10558
10559 kind = PyUnicode_KIND(u);
10560 data = PyUnicode_DATA(u);
10561 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010562 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010563 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010564 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010565 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010566 assert(_PyUnicode_CheckConsistency(u, 1));
10567 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568}
10569
Alexander Belopolsky40018472011-02-26 01:02:56 +000010570PyObject *
10571PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010573 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010575 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577
Benjamin Petersonead6b532011-12-20 17:23:42 -060010578 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010580 if (PyUnicode_IS_ASCII(string))
10581 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010582 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010583 PyUnicode_GET_LENGTH(string), keepends);
10584 else
10585 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010586 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010587 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 break;
10589 case PyUnicode_2BYTE_KIND:
10590 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010591 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 PyUnicode_GET_LENGTH(string), keepends);
10593 break;
10594 case PyUnicode_4BYTE_KIND:
10595 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010596 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 PyUnicode_GET_LENGTH(string), keepends);
10598 break;
10599 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010600 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603}
10604
Alexander Belopolsky40018472011-02-26 01:02:56 +000010605static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010606split(PyObject *self,
10607 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010608 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010610 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010611 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 Py_ssize_t len1, len2;
10613 PyObject* out;
10614
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010616 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (PyUnicode_READY(self) == -1)
10619 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010622 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010624 if (PyUnicode_IS_ASCII(self))
10625 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010626 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010627 PyUnicode_GET_LENGTH(self), maxcount
10628 );
10629 else
10630 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010631 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010632 PyUnicode_GET_LENGTH(self), maxcount
10633 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 case PyUnicode_2BYTE_KIND:
10635 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010636 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 PyUnicode_GET_LENGTH(self), maxcount
10638 );
10639 case PyUnicode_4BYTE_KIND:
10640 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010641 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 PyUnicode_GET_LENGTH(self), maxcount
10643 );
10644 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010645 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 }
10647
10648 if (PyUnicode_READY(substring) == -1)
10649 return NULL;
10650
10651 kind1 = PyUnicode_KIND(self);
10652 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 len1 = PyUnicode_GET_LENGTH(self);
10654 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010655 if (kind1 < kind2 || len1 < len2) {
10656 out = PyList_New(1);
10657 if (out == NULL)
10658 return NULL;
10659 Py_INCREF(self);
10660 PyList_SET_ITEM(out, 0, self);
10661 return out;
10662 }
10663 buf1 = PyUnicode_DATA(self);
10664 buf2 = PyUnicode_DATA(substring);
10665 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010666 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010667 if (!buf2)
10668 return NULL;
10669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010671 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010673 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10674 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010675 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010676 else
10677 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010678 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 break;
10680 case PyUnicode_2BYTE_KIND:
10681 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010682 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 break;
10684 case PyUnicode_4BYTE_KIND:
10685 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010686 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 break;
10688 default:
10689 out = NULL;
10690 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010691 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010692 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010693 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695}
10696
Alexander Belopolsky40018472011-02-26 01:02:56 +000010697static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010698rsplit(PyObject *self,
10699 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010700 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010701{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010702 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010703 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 Py_ssize_t len1, len2;
10705 PyObject* out;
10706
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010707 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010708 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 if (PyUnicode_READY(self) == -1)
10711 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010714 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010716 if (PyUnicode_IS_ASCII(self))
10717 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010718 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010719 PyUnicode_GET_LENGTH(self), maxcount
10720 );
10721 else
10722 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010723 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010724 PyUnicode_GET_LENGTH(self), maxcount
10725 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 case PyUnicode_2BYTE_KIND:
10727 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010728 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 PyUnicode_GET_LENGTH(self), maxcount
10730 );
10731 case PyUnicode_4BYTE_KIND:
10732 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010733 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 PyUnicode_GET_LENGTH(self), maxcount
10735 );
10736 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010737 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 }
10739
10740 if (PyUnicode_READY(substring) == -1)
10741 return NULL;
10742
10743 kind1 = PyUnicode_KIND(self);
10744 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 len1 = PyUnicode_GET_LENGTH(self);
10746 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010747 if (kind1 < kind2 || len1 < len2) {
10748 out = PyList_New(1);
10749 if (out == NULL)
10750 return NULL;
10751 Py_INCREF(self);
10752 PyList_SET_ITEM(out, 0, self);
10753 return out;
10754 }
10755 buf1 = PyUnicode_DATA(self);
10756 buf2 = PyUnicode_DATA(substring);
10757 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010758 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010759 if (!buf2)
10760 return NULL;
10761 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010763 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010765 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10766 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010767 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010768 else
10769 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010770 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 break;
10772 case PyUnicode_2BYTE_KIND:
10773 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010774 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 break;
10776 case PyUnicode_4BYTE_KIND:
10777 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010778 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 break;
10780 default:
10781 out = NULL;
10782 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010783 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010784 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010785 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 return out;
10787}
10788
10789static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010790anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10791 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010793 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010795 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10796 return asciilib_find(buf1, len1, buf2, len2, offset);
10797 else
10798 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 case PyUnicode_2BYTE_KIND:
10800 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10801 case PyUnicode_4BYTE_KIND:
10802 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10803 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010804 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805}
10806
10807static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010808anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10809 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010811 switch (kind) {
10812 case PyUnicode_1BYTE_KIND:
10813 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10814 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10815 else
10816 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10817 case PyUnicode_2BYTE_KIND:
10818 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10819 case PyUnicode_4BYTE_KIND:
10820 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10821 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010822 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010823}
10824
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010825static void
10826replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10827 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10828{
10829 int kind = PyUnicode_KIND(u);
10830 void *data = PyUnicode_DATA(u);
10831 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10832 if (kind == PyUnicode_1BYTE_KIND) {
10833 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10834 (Py_UCS1 *)data + len,
10835 u1, u2, maxcount);
10836 }
10837 else if (kind == PyUnicode_2BYTE_KIND) {
10838 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10839 (Py_UCS2 *)data + len,
10840 u1, u2, maxcount);
10841 }
10842 else {
10843 assert(kind == PyUnicode_4BYTE_KIND);
10844 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10845 (Py_UCS4 *)data + len,
10846 u1, u2, maxcount);
10847 }
10848}
10849
Alexander Belopolsky40018472011-02-26 01:02:56 +000010850static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851replace(PyObject *self, PyObject *str1,
10852 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010855 const char *sbuf = PyUnicode_DATA(self);
10856 const void *buf1 = PyUnicode_DATA(str1);
10857 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 int srelease = 0, release1 = 0, release2 = 0;
10859 int skind = PyUnicode_KIND(self);
10860 int kind1 = PyUnicode_KIND(str1);
10861 int kind2 = PyUnicode_KIND(str2);
10862 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10863 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10864 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010865 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010866 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010868 if (slen < len1)
10869 goto nothing;
10870
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010872 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010873 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010874 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875
Victor Stinner59de0ee2011-10-07 10:01:28 +020010876 if (str1 == str2)
10877 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878
Victor Stinner49a0a212011-10-12 23:46:10 +020010879 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010880 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10881 if (maxchar < maxchar_str1)
10882 /* substring too wide to be present */
10883 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010884 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10885 /* Replacing str1 with str2 may cause a maxchar reduction in the
10886 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010887 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010888 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010891 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010893 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010895 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010896 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010897 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010898
Victor Stinner69ed0f42013-04-09 21:48:24 +020010899 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010900 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010901 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010902 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010903 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010905 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010907
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010908 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10909 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010910 }
10911 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 int rkind = skind;
10913 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010914 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 if (kind1 < rkind) {
10917 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010918 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 if (!buf1) goto error;
10920 release1 = 1;
10921 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010922 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010923 if (i < 0)
10924 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (rkind > kind2) {
10926 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010927 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 if (!buf2) goto error;
10929 release2 = 1;
10930 }
10931 else if (rkind < kind2) {
10932 /* widen self and buf1 */
10933 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010934 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010935 assert(buf1 != PyUnicode_DATA(str1));
10936 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010937 buf1 = PyUnicode_DATA(str1);
10938 release1 = 0;
10939 }
10940 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 if (!sbuf) goto error;
10942 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010943 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 if (!buf1) goto error;
10945 release1 = 1;
10946 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010947 u = PyUnicode_New(slen, maxchar);
10948 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010950 assert(PyUnicode_KIND(u) == rkind);
10951 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010952
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010953 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010954 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010955 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010957 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010959
10960 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010961 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010962 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010963 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010964 if (i == -1)
10965 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010966 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010968 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010972 }
10973 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010975 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 int rkind = skind;
10977 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010980 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010981 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (!buf1) goto error;
10983 release1 = 1;
10984 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010985 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010986 if (n == 0)
10987 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010989 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010990 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 if (!buf2) goto error;
10992 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010995 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010997 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 if (!sbuf) goto error;
10999 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011000 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011001 assert(buf1 != PyUnicode_DATA(str1));
11002 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011003 buf1 = PyUnicode_DATA(str1);
11004 release1 = 0;
11005 }
11006 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 if (!buf1) goto error;
11008 release1 = 1;
11009 }
11010 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
11011 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011012 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 PyErr_SetString(PyExc_OverflowError,
11014 "replace string is too long");
11015 goto error;
11016 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010011017 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020011018 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020011019 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020011020 goto done;
11021 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080011022 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 PyErr_SetString(PyExc_OverflowError,
11024 "replace string is too long");
11025 goto error;
11026 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011027 u = PyUnicode_New(new_size, maxchar);
11028 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020011030 assert(PyUnicode_KIND(u) == rkind);
11031 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032 ires = i = 0;
11033 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011034 while (n-- > 0) {
11035 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020011036 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011037 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020011038 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011039 if (j == -1)
11040 break;
11041 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011042 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011043 memcpy(res + rkind * ires,
11044 sbuf + rkind * i,
11045 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011047 }
11048 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011050 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011052 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011058 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011059 memcpy(res + rkind * ires,
11060 sbuf + rkind * i,
11061 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011062 }
11063 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011064 /* interleave */
11065 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011066 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011068 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011070 if (--n <= 0)
11071 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011072 memcpy(res + rkind * ires,
11073 sbuf + rkind * i,
11074 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 ires++;
11076 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011077 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011078 memcpy(res + rkind * ires,
11079 sbuf + rkind * i,
11080 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011081 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011082 }
11083
11084 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011085 unicode_adjust_maxchar(&u);
11086 if (u == NULL)
11087 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011089
11090 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011091 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11092 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11093 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011095 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011097 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011099 PyMem_Free((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011100 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011102
Benjamin Peterson29060642009-01-31 22:14:21 +000011103 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011104 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011105 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11106 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11107 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011108 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011109 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011111 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011113 PyMem_Free((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011114 return unicode_result_unchanged(self);
11115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011117 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11118 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11119 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11120 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011121 PyMem_Free((void *)sbuf);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011122 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011123 PyMem_Free((void *)buf1);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011124 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011125 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127}
11128
11129/* --- Unicode Object Methods --------------------------------------------- */
11130
INADA Naoki3ae20562017-01-16 20:41:20 +090011131/*[clinic input]
11132str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133
INADA Naoki3ae20562017-01-16 20:41:20 +090011134Return a version of the string where each word is titlecased.
11135
11136More specifically, words start with uppercased characters and all remaining
11137cased characters have lower case.
11138[clinic start generated code]*/
11139
11140static PyObject *
11141unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011142/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011144 if (PyUnicode_READY(self) == -1)
11145 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011146 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147}
11148
INADA Naoki3ae20562017-01-16 20:41:20 +090011149/*[clinic input]
11150str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151
INADA Naoki3ae20562017-01-16 20:41:20 +090011152Return a capitalized version of the string.
11153
11154More specifically, make the first character have upper case and the rest lower
11155case.
11156[clinic start generated code]*/
11157
11158static PyObject *
11159unicode_capitalize_impl(PyObject *self)
11160/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011162 if (PyUnicode_READY(self) == -1)
11163 return NULL;
11164 if (PyUnicode_GET_LENGTH(self) == 0)
11165 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011166 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167}
11168
INADA Naoki3ae20562017-01-16 20:41:20 +090011169/*[clinic input]
11170str.casefold as unicode_casefold
11171
11172Return a version of the string suitable for caseless comparisons.
11173[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011174
11175static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011176unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011177/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011178{
11179 if (PyUnicode_READY(self) == -1)
11180 return NULL;
11181 if (PyUnicode_IS_ASCII(self))
11182 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011183 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011184}
11185
11186
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011187/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011188
11189static int
11190convert_uc(PyObject *obj, void *addr)
11191{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011193
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011194 if (!PyUnicode_Check(obj)) {
11195 PyErr_Format(PyExc_TypeError,
11196 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011197 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011198 return 0;
11199 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011200 if (PyUnicode_READY(obj) < 0)
11201 return 0;
11202 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011203 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011204 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011205 return 0;
11206 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011207 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011208 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011209}
11210
INADA Naoki3ae20562017-01-16 20:41:20 +090011211/*[clinic input]
11212str.center as unicode_center
11213
11214 width: Py_ssize_t
11215 fillchar: Py_UCS4 = ' '
11216 /
11217
11218Return a centered string of length width.
11219
11220Padding is done using the specified fill character (default is a space).
11221[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222
11223static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011224unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11225/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011227 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228
Benjamin Petersonbac79492012-01-14 13:34:47 -050011229 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230 return NULL;
11231
Victor Stinnerc4b49542011-12-11 22:44:26 +010011232 if (PyUnicode_GET_LENGTH(self) >= width)
11233 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234
Victor Stinnerc4b49542011-12-11 22:44:26 +010011235 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 left = marg / 2 + (marg & width & 1);
11237
Victor Stinner9310abb2011-10-05 00:59:23 +020011238 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239}
11240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241/* This function assumes that str1 and str2 are readied by the caller. */
11242
Marc-André Lemburge5034372000-08-08 08:04:29 +000011243static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011244unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011245{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011246#define COMPARE(TYPE1, TYPE2) \
11247 do { \
11248 TYPE1* p1 = (TYPE1 *)data1; \
11249 TYPE2* p2 = (TYPE2 *)data2; \
11250 TYPE1* end = p1 + len; \
11251 Py_UCS4 c1, c2; \
11252 for (; p1 != end; p1++, p2++) { \
11253 c1 = *p1; \
11254 c2 = *p2; \
11255 if (c1 != c2) \
11256 return (c1 < c2) ? -1 : 1; \
11257 } \
11258 } \
11259 while (0)
11260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011262 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011263 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 kind1 = PyUnicode_KIND(str1);
11266 kind2 = PyUnicode_KIND(str2);
11267 data1 = PyUnicode_DATA(str1);
11268 data2 = PyUnicode_DATA(str2);
11269 len1 = PyUnicode_GET_LENGTH(str1);
11270 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011271 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011272
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011273 switch(kind1) {
11274 case PyUnicode_1BYTE_KIND:
11275 {
11276 switch(kind2) {
11277 case PyUnicode_1BYTE_KIND:
11278 {
11279 int cmp = memcmp(data1, data2, len);
11280 /* normalize result of memcmp() into the range [-1; 1] */
11281 if (cmp < 0)
11282 return -1;
11283 if (cmp > 0)
11284 return 1;
11285 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011286 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011287 case PyUnicode_2BYTE_KIND:
11288 COMPARE(Py_UCS1, Py_UCS2);
11289 break;
11290 case PyUnicode_4BYTE_KIND:
11291 COMPARE(Py_UCS1, Py_UCS4);
11292 break;
11293 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011294 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011295 }
11296 break;
11297 }
11298 case PyUnicode_2BYTE_KIND:
11299 {
11300 switch(kind2) {
11301 case PyUnicode_1BYTE_KIND:
11302 COMPARE(Py_UCS2, Py_UCS1);
11303 break;
11304 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011305 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011306 COMPARE(Py_UCS2, Py_UCS2);
11307 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011308 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011309 case PyUnicode_4BYTE_KIND:
11310 COMPARE(Py_UCS2, Py_UCS4);
11311 break;
11312 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011313 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011314 }
11315 break;
11316 }
11317 case PyUnicode_4BYTE_KIND:
11318 {
11319 switch(kind2) {
11320 case PyUnicode_1BYTE_KIND:
11321 COMPARE(Py_UCS4, Py_UCS1);
11322 break;
11323 case PyUnicode_2BYTE_KIND:
11324 COMPARE(Py_UCS4, Py_UCS2);
11325 break;
11326 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011327 {
11328#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11329 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11330 /* normalize result of wmemcmp() into the range [-1; 1] */
11331 if (cmp < 0)
11332 return -1;
11333 if (cmp > 0)
11334 return 1;
11335#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011336 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011337#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011338 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011339 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011340 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011341 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011342 }
11343 break;
11344 }
11345 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011346 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011347 }
11348
Victor Stinner770e19e2012-10-04 22:59:45 +020011349 if (len1 == len2)
11350 return 0;
11351 if (len1 < len2)
11352 return -1;
11353 else
11354 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011355
11356#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011357}
11358
Benjamin Peterson621b4302016-09-09 13:54:34 -070011359static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011360unicode_compare_eq(PyObject *str1, PyObject *str2)
11361{
11362 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011363 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011364 Py_ssize_t len;
11365 int cmp;
11366
Victor Stinnere5567ad2012-10-23 02:48:49 +020011367 len = PyUnicode_GET_LENGTH(str1);
11368 if (PyUnicode_GET_LENGTH(str2) != len)
11369 return 0;
11370 kind = PyUnicode_KIND(str1);
11371 if (PyUnicode_KIND(str2) != kind)
11372 return 0;
11373 data1 = PyUnicode_DATA(str1);
11374 data2 = PyUnicode_DATA(str2);
11375
11376 cmp = memcmp(data1, data2, len * kind);
11377 return (cmp == 0);
11378}
11379
11380
Alexander Belopolsky40018472011-02-26 01:02:56 +000011381int
11382PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11385 if (PyUnicode_READY(left) == -1 ||
11386 PyUnicode_READY(right) == -1)
11387 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011388
11389 /* a string is equal to itself */
11390 if (left == right)
11391 return 0;
11392
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011393 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011395 PyErr_Format(PyExc_TypeError,
11396 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011397 Py_TYPE(left)->tp_name,
11398 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399 return -1;
11400}
11401
Martin v. Löwis5b222132007-06-10 09:51:05 +000011402int
11403PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11404{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 Py_ssize_t i;
11406 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011408 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409
Victor Stinner910337b2011-10-03 03:20:16 +020011410 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011411 if (!PyUnicode_IS_READY(uni)) {
11412 const wchar_t *ws = _PyUnicode_WSTR(uni);
11413 /* Compare Unicode string and source character set string */
11414 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11415 if (chr != ustr[i])
11416 return (chr < ustr[i]) ? -1 : 1;
11417 }
11418 /* This check keeps Python strings that end in '\0' from comparing equal
11419 to C strings identical up to that point. */
11420 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11421 return 1; /* uni is longer */
11422 if (ustr[i])
11423 return -1; /* str is longer */
11424 return 0;
11425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011427 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011428 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011429 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011430 size_t len, len2 = strlen(str);
11431 int cmp;
11432
11433 len = Py_MIN(len1, len2);
11434 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011435 if (cmp != 0) {
11436 if (cmp < 0)
11437 return -1;
11438 else
11439 return 1;
11440 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011441 if (len1 > len2)
11442 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011443 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011444 return -1; /* str is longer */
11445 return 0;
11446 }
11447 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011448 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011449 /* Compare Unicode string and source character set string */
11450 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011451 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011452 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11453 /* This check keeps Python strings that end in '\0' from comparing equal
11454 to C strings identical up to that point. */
11455 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11456 return 1; /* uni is longer */
11457 if (str[i])
11458 return -1; /* str is longer */
11459 return 0;
11460 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011461}
11462
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011463static int
11464non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11465{
11466 size_t i, len;
11467 const wchar_t *p;
11468 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11469 if (strlen(str) != len)
11470 return 0;
11471 p = _PyUnicode_WSTR(unicode);
11472 assert(p);
11473 for (i = 0; i < len; i++) {
11474 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011475 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011476 return 0;
11477 }
11478 return 1;
11479}
11480
11481int
11482_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11483{
11484 size_t len;
11485 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011486 assert(str);
11487#ifndef NDEBUG
11488 for (const char *p = str; *p; p++) {
11489 assert((unsigned char)*p < 128);
11490 }
11491#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011492 if (PyUnicode_READY(unicode) == -1) {
11493 /* Memory error or bad data */
11494 PyErr_Clear();
11495 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11496 }
11497 if (!PyUnicode_IS_ASCII(unicode))
11498 return 0;
11499 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11500 return strlen(str) == len &&
11501 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11502}
11503
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011504int
11505_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11506{
11507 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011508
11509 assert(_PyUnicode_CHECK(left));
11510 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011511#ifndef NDEBUG
11512 for (const char *p = right->string; *p; p++) {
11513 assert((unsigned char)*p < 128);
11514 }
11515#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011516
11517 if (PyUnicode_READY(left) == -1) {
11518 /* memory error or bad data */
11519 PyErr_Clear();
11520 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11521 }
11522
11523 if (!PyUnicode_IS_ASCII(left))
11524 return 0;
11525
11526 right_uni = _PyUnicode_FromId(right); /* borrowed */
11527 if (right_uni == NULL) {
11528 /* memory error or bad data */
11529 PyErr_Clear();
11530 return _PyUnicode_EqualToASCIIString(left, right->string);
11531 }
11532
11533 if (left == right_uni)
11534 return 1;
11535
11536 if (PyUnicode_CHECK_INTERNED(left))
11537 return 0;
11538
Victor Stinner607b1022020-05-05 18:50:30 +020011539#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011540 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011541 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011542 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11543 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011544#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011545
11546 return unicode_compare_eq(left, right_uni);
11547}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011548
Alexander Belopolsky40018472011-02-26 01:02:56 +000011549PyObject *
11550PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011551{
11552 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011553
Victor Stinnere5567ad2012-10-23 02:48:49 +020011554 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11555 Py_RETURN_NOTIMPLEMENTED;
11556
11557 if (PyUnicode_READY(left) == -1 ||
11558 PyUnicode_READY(right) == -1)
11559 return NULL;
11560
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011561 if (left == right) {
11562 switch (op) {
11563 case Py_EQ:
11564 case Py_LE:
11565 case Py_GE:
11566 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011567 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011568 case Py_NE:
11569 case Py_LT:
11570 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011571 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011572 default:
11573 PyErr_BadArgument();
11574 return NULL;
11575 }
11576 }
11577 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011578 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011579 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011580 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011581 }
11582 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011583 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011584 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011585 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011586}
11587
Alexander Belopolsky40018472011-02-26 01:02:56 +000011588int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011589_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11590{
11591 return unicode_eq(aa, bb);
11592}
11593
11594int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011595PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011596{
Victor Stinner77282cb2013-04-14 19:22:47 +020011597 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011598 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011600 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011601
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011602 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011604 "'in <string>' requires string as left operand, not %.100s",
11605 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011606 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011607 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011608 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011609 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011610 if (ensure_unicode(str) < 0)
11611 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011614 kind2 = PyUnicode_KIND(substr);
11615 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011616 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011618 len2 = PyUnicode_GET_LENGTH(substr);
11619 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011620 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011621 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011622 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011623 if (len2 == 1) {
11624 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11625 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011626 return result;
11627 }
11628 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011629 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011630 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011631 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633
Victor Stinner77282cb2013-04-14 19:22:47 +020011634 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 case PyUnicode_1BYTE_KIND:
11636 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11637 break;
11638 case PyUnicode_2BYTE_KIND:
11639 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11640 break;
11641 case PyUnicode_4BYTE_KIND:
11642 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11643 break;
11644 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011645 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011647
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011648 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011649 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011650 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651
Guido van Rossum403d68b2000-03-13 15:55:09 +000011652 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011653}
11654
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655/* Concat to string or Unicode object giving a new Unicode object. */
11656
Alexander Belopolsky40018472011-02-26 01:02:56 +000011657PyObject *
11658PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011660 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011661 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011662 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011664 if (ensure_unicode(left) < 0)
11665 return NULL;
11666
11667 if (!PyUnicode_Check(right)) {
11668 PyErr_Format(PyExc_TypeError,
11669 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011670 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011671 return NULL;
11672 }
11673 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011674 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675
11676 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011677 PyObject *empty = unicode_get_empty(); // Borrowed reference
11678 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011679 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011680 }
11681 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011682 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011685 left_len = PyUnicode_GET_LENGTH(left);
11686 right_len = PyUnicode_GET_LENGTH(right);
11687 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011688 PyErr_SetString(PyExc_OverflowError,
11689 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011690 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011691 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011692 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011693
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011694 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11695 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011696 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011699 result = PyUnicode_New(new_len, maxchar);
11700 if (result == NULL)
11701 return NULL;
11702 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11703 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11704 assert(_PyUnicode_CheckConsistency(result, 1));
11705 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706}
11707
Walter Dörwald1ab83302007-05-18 17:15:44 +000011708void
Victor Stinner23e56682011-10-03 03:54:37 +020011709PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011710{
Victor Stinner23e56682011-10-03 03:54:37 +020011711 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011712 Py_UCS4 maxchar, maxchar2;
11713 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011714
11715 if (p_left == NULL) {
11716 if (!PyErr_Occurred())
11717 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011718 return;
11719 }
Victor Stinner23e56682011-10-03 03:54:37 +020011720 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011721 if (right == NULL || left == NULL
11722 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011723 if (!PyErr_Occurred())
11724 PyErr_BadInternalCall();
11725 goto error;
11726 }
11727
Benjamin Petersonbac79492012-01-14 13:34:47 -050011728 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011729 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011730 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011731 goto error;
11732
Victor Stinner488fa492011-12-12 00:01:39 +010011733 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011734 PyObject *empty = unicode_get_empty(); // Borrowed reference
11735 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011736 Py_DECREF(left);
11737 Py_INCREF(right);
11738 *p_left = right;
11739 return;
11740 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011741 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011742 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011743 }
Victor Stinner488fa492011-12-12 00:01:39 +010011744
11745 left_len = PyUnicode_GET_LENGTH(left);
11746 right_len = PyUnicode_GET_LENGTH(right);
11747 if (left_len > PY_SSIZE_T_MAX - right_len) {
11748 PyErr_SetString(PyExc_OverflowError,
11749 "strings are too large to concat");
11750 goto error;
11751 }
11752 new_len = left_len + right_len;
11753
11754 if (unicode_modifiable(left)
11755 && PyUnicode_CheckExact(right)
11756 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011757 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11758 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011759 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011760 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011761 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11762 {
11763 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011764 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011765 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011766
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011767 /* copy 'right' into the newly allocated area of 'left' */
11768 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011769 }
Victor Stinner488fa492011-12-12 00:01:39 +010011770 else {
11771 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11772 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011773 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011774
Victor Stinner488fa492011-12-12 00:01:39 +010011775 /* Concat the two Unicode strings */
11776 res = PyUnicode_New(new_len, maxchar);
11777 if (res == NULL)
11778 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011779 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11780 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011781 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011782 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011783 }
11784 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011785 return;
11786
11787error:
Victor Stinner488fa492011-12-12 00:01:39 +010011788 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011789}
11790
11791void
11792PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11793{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011794 PyUnicode_Append(pleft, right);
11795 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011796}
11797
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011798/*
11799Wraps stringlib_parse_args_finds() and additionally ensures that the
11800first argument is a unicode object.
11801*/
11802
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011803static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011804parse_args_finds_unicode(const char * function_name, PyObject *args,
11805 PyObject **substring,
11806 Py_ssize_t *start, Py_ssize_t *end)
11807{
11808 if(stringlib_parse_args_finds(function_name, args, substring,
11809 start, end)) {
11810 if (ensure_unicode(*substring) < 0)
11811 return 0;
11812 return 1;
11813 }
11814 return 0;
11815}
11816
/* Help text describing S.count(), stored as count__doc__ via PyDoc_STRVAR. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011817PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011820Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011821string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011822interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823
11824static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011825unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011827 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011828 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011829 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011831 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011832 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011835 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 kind1 = PyUnicode_KIND(self);
11839 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011840 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011841 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 len1 = PyUnicode_GET_LENGTH(self);
11844 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011846 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011847 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011848
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011849 buf1 = PyUnicode_DATA(self);
11850 buf2 = PyUnicode_DATA(substring);
11851 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011852 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011853 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011854 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011855 }
11856 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 case PyUnicode_1BYTE_KIND:
11858 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011859 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 buf2, len2, PY_SSIZE_T_MAX
11861 );
11862 break;
11863 case PyUnicode_2BYTE_KIND:
11864 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011865 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 buf2, len2, PY_SSIZE_T_MAX
11867 );
11868 break;
11869 case PyUnicode_4BYTE_KIND:
11870 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011871 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 buf2, len2, PY_SSIZE_T_MAX
11873 );
11874 break;
11875 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011876 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 }
11878
11879 result = PyLong_FromSsize_t(iresult);
11880
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011881 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011882 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011883 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 return result;
11886}
11887
INADA Naoki3ae20562017-01-16 20:41:20 +090011888/*[clinic input]
11889str.encode as unicode_encode
11890
11891 encoding: str(c_default="NULL") = 'utf-8'
11892 The encoding in which to encode the string.
11893 errors: str(c_default="NULL") = 'strict'
11894 The error handling scheme to use for encoding errors.
11895 The default is 'strict' meaning that encoding errors raise a
11896 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11897 'xmlcharrefreplace' as well as any other name registered with
11898 codecs.register_error that can handle UnicodeEncodeErrors.
11899
11900Encode the string using the codec registered for encoding.
11901[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902
11903static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011904unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011905/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011907 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011908}
11909
INADA Naoki3ae20562017-01-16 20:41:20 +090011910/*[clinic input]
11911str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
INADA Naoki3ae20562017-01-16 20:41:20 +090011913 tabsize: int = 8
11914
11915Return a copy where all tab characters are expanded using spaces.
11916
11917If tabsize is not given, a tab size of 8 characters is assumed.
11918[clinic start generated code]*/
11919
11920static PyObject *
11921unicode_expandtabs_impl(PyObject *self, int tabsize)
11922/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011924 Py_ssize_t i, j, line_pos, src_len, incr;
11925 Py_UCS4 ch;
11926 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011927 const void *src_data;
11928 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011929 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011930 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
Antoine Pitrou22425222011-10-04 19:10:51 +020011932 if (PyUnicode_READY(self) == -1)
11933 return NULL;
11934
Thomas Wouters7e474022000-07-16 12:04:32 +000011935 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011936 src_len = PyUnicode_GET_LENGTH(self);
11937 i = j = line_pos = 0;
11938 kind = PyUnicode_KIND(self);
11939 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011940 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011941 for (; i < src_len; i++) {
11942 ch = PyUnicode_READ(kind, src_data, i);
11943 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011944 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011946 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011948 goto overflow;
11949 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011951 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011955 goto overflow;
11956 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011958 if (ch == '\n' || ch == '\r')
11959 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011961 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011962 if (!found)
11963 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011964
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011966 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967 if (!u)
11968 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011969 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970
Antoine Pitroue71d5742011-10-04 15:55:09 +020011971 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972
Antoine Pitroue71d5742011-10-04 15:55:09 +020011973 for (; i < src_len; i++) {
11974 ch = PyUnicode_READ(kind, src_data, i);
11975 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011977 incr = tabsize - (line_pos % tabsize);
11978 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011979 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011980 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011982 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011984 line_pos++;
11985 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011986 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011987 if (ch == '\n' || ch == '\r')
11988 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011990 }
11991 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011992 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011993
Antoine Pitroue71d5742011-10-04 15:55:09 +020011994 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011995 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11996 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997}
11998
/* Help text describing S.find(), stored as find__doc__ via PyDoc_STRVAR. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011999PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001\n\
12002Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012003such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004arguments start and end are interpreted as in slice notation.\n\
12005\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012006Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
12008static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012011 /* initialize variables to prevent gcc warning */
12012 PyObject *substring = NULL;
12013 Py_ssize_t start = 0;
12014 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012015 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012017 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012020 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012023 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 if (result == -2)
12026 return NULL;
12027
Christian Heimes217cfd12007-12-02 14:31:20 +000012028 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029}
12030
12031static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012032unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012034 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012035 enum PyUnicode_Kind kind;
12036 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012037
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012038 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012039 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012041 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012042 if (PyUnicode_READY(self) == -1) {
12043 return NULL;
12044 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012045 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12046 PyErr_SetString(PyExc_IndexError, "string index out of range");
12047 return NULL;
12048 }
12049 kind = PyUnicode_KIND(self);
12050 data = PyUnicode_DATA(self);
12051 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012052 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053}
12054
Guido van Rossumc2504932007-09-18 19:42:40 +000012055/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012056 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012057static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012058unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012060 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012061
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012062#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012063 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012064#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 if (_PyUnicode_HASH(self) != -1)
12066 return _PyUnicode_HASH(self);
12067 if (PyUnicode_READY(self) == -1)
12068 return -1;
animalizea1d14252019-01-02 20:16:06 +080012069
Christian Heimes985ecdc2013-11-20 11:46:18 +010012070 x = _Py_HashBytes(PyUnicode_DATA(self),
12071 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012073 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074}
12075
/* Help text describing S.index(), stored as index__doc__ via PyDoc_STRVAR. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012076PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012077 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078\n\
oldkaa0735f2018-02-02 16:52:55 +080012079Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012080such that sub is contained within S[start:end]. Optional\n\
12081arguments start and end are interpreted as in slice notation.\n\
12082\n\
12083Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084
12085static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012088 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012089 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012090 PyObject *substring = NULL;
12091 Py_ssize_t start = 0;
12092 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012094 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012097 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012100 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 if (result == -2)
12103 return NULL;
12104
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 if (result < 0) {
12106 PyErr_SetString(PyExc_ValueError, "substring not found");
12107 return NULL;
12108 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012109
Christian Heimes217cfd12007-12-02 14:31:20 +000012110 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111}
12112
INADA Naoki3ae20562017-01-16 20:41:20 +090012113/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012114str.isascii as unicode_isascii
12115
12116Return True if all characters in the string are ASCII, False otherwise.
12117
12118ASCII characters have code points in the range U+0000-U+007F.
12119Empty string is ASCII too.
12120[clinic start generated code]*/
12121
12122static PyObject *
12123unicode_isascii_impl(PyObject *self)
12124/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12125{
12126 if (PyUnicode_READY(self) == -1) {
12127 return NULL;
12128 }
12129 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12130}
12131
12132/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012133str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134
INADA Naoki3ae20562017-01-16 20:41:20 +090012135Return True if the string is a lowercase string, False otherwise.
12136
12137A string is lowercase if all cased characters in the string are lowercase and
12138there is at least one cased character in the string.
12139[clinic start generated code]*/
12140
12141static PyObject *
12142unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012143/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 Py_ssize_t i, length;
12146 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012147 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148 int cased;
12149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 if (PyUnicode_READY(self) == -1)
12151 return NULL;
12152 length = PyUnicode_GET_LENGTH(self);
12153 kind = PyUnicode_KIND(self);
12154 data = PyUnicode_DATA(self);
12155
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 if (length == 1)
12158 return PyBool_FromLong(
12159 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012161 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012163 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012164
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 for (i = 0; i < length; i++) {
12167 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012168
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012170 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012171 else if (!cased && Py_UNICODE_ISLOWER(ch))
12172 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012174 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175}
12176
INADA Naoki3ae20562017-01-16 20:41:20 +090012177/*[clinic input]
12178str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179
INADA Naoki3ae20562017-01-16 20:41:20 +090012180Return True if the string is an uppercase string, False otherwise.
12181
12182A string is uppercase if all cased characters in the string are uppercase and
12183there is at least one cased character in the string.
12184[clinic start generated code]*/
12185
12186static PyObject *
12187unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012188/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 Py_ssize_t i, length;
12191 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012192 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 int cased;
12194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 if (PyUnicode_READY(self) == -1)
12196 return NULL;
12197 length = PyUnicode_GET_LENGTH(self);
12198 kind = PyUnicode_KIND(self);
12199 data = PyUnicode_DATA(self);
12200
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (length == 1)
12203 return PyBool_FromLong(
12204 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012206 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012208 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012209
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 for (i = 0; i < length; i++) {
12212 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012213
Benjamin Peterson29060642009-01-31 22:14:21 +000012214 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012215 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012216 else if (!cased && Py_UNICODE_ISUPPER(ch))
12217 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012219 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220}
12221
INADA Naoki3ae20562017-01-16 20:41:20 +090012222/*[clinic input]
12223str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224
INADA Naoki3ae20562017-01-16 20:41:20 +090012225Return True if the string is a title-cased string, False otherwise.
12226
12227In a title-cased string, upper- and title-case characters may only
12228follow uncased characters and lowercase characters only cased ones.
12229[clinic start generated code]*/
12230
12231static PyObject *
12232unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012233/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 Py_ssize_t i, length;
12236 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012237 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238 int cased, previous_is_cased;
12239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 if (PyUnicode_READY(self) == -1)
12241 return NULL;
12242 length = PyUnicode_GET_LENGTH(self);
12243 kind = PyUnicode_KIND(self);
12244 data = PyUnicode_DATA(self);
12245
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 if (length == 1) {
12248 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12249 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12250 (Py_UNICODE_ISUPPER(ch) != 0));
12251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012253 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012255 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012256
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 cased = 0;
12258 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 for (i = 0; i < length; i++) {
12260 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012261
Benjamin Peterson29060642009-01-31 22:14:21 +000012262 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12263 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012264 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 previous_is_cased = 1;
12266 cased = 1;
12267 }
12268 else if (Py_UNICODE_ISLOWER(ch)) {
12269 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012270 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 previous_is_cased = 1;
12272 cased = 1;
12273 }
12274 else
12275 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012277 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278}
12279
INADA Naoki3ae20562017-01-16 20:41:20 +090012280/*[clinic input]
12281str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
INADA Naoki3ae20562017-01-16 20:41:20 +090012283Return True if the string is a whitespace string, False otherwise.
12284
12285A string is whitespace if all characters in the string are whitespace and there
12286is at least one character in the string.
12287[clinic start generated code]*/
12288
12289static PyObject *
12290unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012291/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 Py_ssize_t i, length;
12294 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012295 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296
12297 if (PyUnicode_READY(self) == -1)
12298 return NULL;
12299 length = PyUnicode_GET_LENGTH(self);
12300 kind = PyUnicode_KIND(self);
12301 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 if (length == 1)
12305 return PyBool_FromLong(
12306 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012308 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012310 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 for (i = 0; i < length; i++) {
12313 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012314 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012315 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012317 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318}
12319
INADA Naoki3ae20562017-01-16 20:41:20 +090012320/*[clinic input]
12321str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012322
INADA Naoki3ae20562017-01-16 20:41:20 +090012323Return True if the string is an alphabetic string, False otherwise.
12324
12325A string is alphabetic if all characters in the string are alphabetic and there
12326is at least one character in the string.
12327[clinic start generated code]*/
12328
12329static PyObject *
12330unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012331/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 Py_ssize_t i, length;
12334 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012335 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336
12337 if (PyUnicode_READY(self) == -1)
12338 return NULL;
12339 length = PyUnicode_GET_LENGTH(self);
12340 kind = PyUnicode_KIND(self);
12341 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012342
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012343 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 if (length == 1)
12345 return PyBool_FromLong(
12346 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012347
12348 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012350 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 for (i = 0; i < length; i++) {
12353 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012354 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012355 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012356 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012357}
12358
INADA Naoki3ae20562017-01-16 20:41:20 +090012359/*[clinic input]
12360str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012361
INADA Naoki3ae20562017-01-16 20:41:20 +090012362Return True if the string is an alpha-numeric string, False otherwise.
12363
12364A string is alpha-numeric if all characters in the string are alpha-numeric and
12365there is at least one character in the string.
12366[clinic start generated code]*/
12367
12368static PyObject *
12369unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012370/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012373 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 Py_ssize_t len, i;
12375
12376 if (PyUnicode_READY(self) == -1)
12377 return NULL;
12378
12379 kind = PyUnicode_KIND(self);
12380 data = PyUnicode_DATA(self);
12381 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012382
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012383 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 if (len == 1) {
12385 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12386 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12387 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012388
12389 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012391 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 for (i = 0; i < len; i++) {
12394 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012395 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012396 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012397 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012398 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012399}
12400
INADA Naoki3ae20562017-01-16 20:41:20 +090012401/*[clinic input]
12402str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403
INADA Naoki3ae20562017-01-16 20:41:20 +090012404Return True if the string is a decimal string, False otherwise.
12405
12406A string is a decimal string if all characters in the string are decimal and
12407there is at least one character in the string.
12408[clinic start generated code]*/
12409
12410static PyObject *
12411unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012412/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 Py_ssize_t i, length;
12415 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012416 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417
12418 if (PyUnicode_READY(self) == -1)
12419 return NULL;
12420 length = PyUnicode_GET_LENGTH(self);
12421 kind = PyUnicode_KIND(self);
12422 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012423
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 if (length == 1)
12426 return PyBool_FromLong(
12427 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012429 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012431 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 for (i = 0; i < length; i++) {
12434 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012435 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012437 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438}
12439
INADA Naoki3ae20562017-01-16 20:41:20 +090012440/*[clinic input]
12441str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442
INADA Naoki3ae20562017-01-16 20:41:20 +090012443Return True if the string is a digit string, False otherwise.
12444
12445A string is a digit string if all characters in the string are digits and there
12446is at least one character in the string.
12447[clinic start generated code]*/
12448
12449static PyObject *
12450unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012451/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 Py_ssize_t i, length;
12454 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012455 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456
12457 if (PyUnicode_READY(self) == -1)
12458 return NULL;
12459 length = PyUnicode_GET_LENGTH(self);
12460 kind = PyUnicode_KIND(self);
12461 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 if (length == 1) {
12465 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12466 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012469 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012471 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 for (i = 0; i < length; i++) {
12474 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012475 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012477 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478}
12479
INADA Naoki3ae20562017-01-16 20:41:20 +090012480/*[clinic input]
12481str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482
INADA Naoki3ae20562017-01-16 20:41:20 +090012483Return True if the string is a numeric string, False otherwise.
12484
12485A string is numeric if all characters in the string are numeric and there is at
12486least one character in the string.
12487[clinic start generated code]*/
12488
12489static PyObject *
12490unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012491/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 Py_ssize_t i, length;
12494 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012495 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496
12497 if (PyUnicode_READY(self) == -1)
12498 return NULL;
12499 length = PyUnicode_GET_LENGTH(self);
12500 kind = PyUnicode_KIND(self);
12501 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 if (length == 1)
12505 return PyBool_FromLong(
12506 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012508 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012510 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 for (i = 0; i < length; i++) {
12513 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012514 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012516 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517}
12518
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012519Py_ssize_t
12520_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012523 if (PyUnicode_READY(self) == -1)
12524 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012525
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012526 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012527 if (len == 0) {
12528 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530 }
12531
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012532 int kind = PyUnicode_KIND(self);
12533 const void *data = PyUnicode_DATA(self);
12534 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012535 /* PEP 3131 says that the first character must be in
12536 XID_Start and subsequent characters in XID_Continue,
12537 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012538 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012539 letters, digits, underscore). However, given the current
12540 definition of XID_Start and XID_Continue, it is sufficient
12541 to check just for these, except that _ must be allowed
12542 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012543 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012544 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012545 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012546
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012547 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012548 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012549 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012550 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012551 }
12552 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012553 return i;
12554}
12555
12556int
12557PyUnicode_IsIdentifier(PyObject *self)
12558{
12559 if (PyUnicode_IS_READY(self)) {
12560 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12561 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12562 /* an empty string is not a valid identifier */
12563 return len && i == len;
12564 }
12565 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012566_Py_COMP_DIAG_PUSH
12567_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012568 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012569 if (len == 0) {
12570 /* an empty string is not a valid identifier */
12571 return 0;
12572 }
12573
12574 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012575 Py_UCS4 ch = wstr[i++];
12576#if SIZEOF_WCHAR_T == 2
12577 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12578 && i < len
12579 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12580 {
12581 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12582 i++;
12583 }
12584#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012585 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12586 return 0;
12587 }
12588
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012589 while (i < len) {
12590 ch = wstr[i++];
12591#if SIZEOF_WCHAR_T == 2
12592 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12593 && i < len
12594 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12595 {
12596 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12597 i++;
12598 }
12599#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012600 if (!_PyUnicode_IsXidContinue(ch)) {
12601 return 0;
12602 }
12603 }
12604 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012605_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012606 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012607}
12608
INADA Naoki3ae20562017-01-16 20:41:20 +090012609/*[clinic input]
12610str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012611
INADA Naoki3ae20562017-01-16 20:41:20 +090012612Return True if the string is a valid Python identifier, False otherwise.
12613
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012614Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012615such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012616[clinic start generated code]*/
12617
12618static PyObject *
12619unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012620/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012621{
12622 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12623}
12624
INADA Naoki3ae20562017-01-16 20:41:20 +090012625/*[clinic input]
12626str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012627
INADA Naoki3ae20562017-01-16 20:41:20 +090012628Return True if the string is printable, False otherwise.
12629
12630A string is printable if all of its characters are considered printable in
12631repr() or if it is empty.
12632[clinic start generated code]*/
12633
12634static PyObject *
12635unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012636/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012637{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 Py_ssize_t i, length;
12639 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012640 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641
12642 if (PyUnicode_READY(self) == -1)
12643 return NULL;
12644 length = PyUnicode_GET_LENGTH(self);
12645 kind = PyUnicode_KIND(self);
12646 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012647
12648 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 if (length == 1)
12650 return PyBool_FromLong(
12651 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 for (i = 0; i < length; i++) {
12654 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012655 Py_RETURN_FALSE;
12656 }
12657 }
12658 Py_RETURN_TRUE;
12659}
12660
INADA Naoki3ae20562017-01-16 20:41:20 +090012661/*[clinic input]
12662str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663
INADA Naoki3ae20562017-01-16 20:41:20 +090012664 iterable: object
12665 /
12666
12667Concatenate any number of strings.
12668
Martin Panter91a88662017-01-24 00:30:06 +000012669The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012670The result is returned as a new string.
12671
12672Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12673[clinic start generated code]*/
12674
12675static PyObject *
12676unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012677/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678{
INADA Naoki3ae20562017-01-16 20:41:20 +090012679 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680}
12681
Martin v. Löwis18e16552006-02-15 17:27:45 +000012682static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012683unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 if (PyUnicode_READY(self) == -1)
12686 return -1;
12687 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688}
12689
INADA Naoki3ae20562017-01-16 20:41:20 +090012690/*[clinic input]
12691str.ljust as unicode_ljust
12692
12693 width: Py_ssize_t
12694 fillchar: Py_UCS4 = ' '
12695 /
12696
12697Return a left-justified string of length width.
12698
12699Padding is done using the specified fill character (default is a space).
12700[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701
12702static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012703unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12704/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012706 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708
Victor Stinnerc4b49542011-12-11 22:44:26 +010012709 if (PyUnicode_GET_LENGTH(self) >= width)
12710 return unicode_result_unchanged(self);
12711
12712 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713}
12714
INADA Naoki3ae20562017-01-16 20:41:20 +090012715/*[clinic input]
12716str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717
INADA Naoki3ae20562017-01-16 20:41:20 +090012718Return a copy of the string converted to lowercase.
12719[clinic start generated code]*/
12720
12721static PyObject *
12722unicode_lower_impl(PyObject *self)
12723/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012725 if (PyUnicode_READY(self) == -1)
12726 return NULL;
12727 if (PyUnicode_IS_ASCII(self))
12728 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012729 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730}
12731
/* Selector values for the strip variants.  They are also used as indices
   into stripfuncnames below, so they must stay dense and zero-based. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the selector values above.  Both the strings and
   the table slots are const: this is a read-only lookup table. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the strip method for selector i (LEFTSTRIP, RIGHTSTRIP or
   BOTHSTRIP); i is not range-checked. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012740
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012741/* externally visible for str.strip(unicode) */
12742PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012743_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012744{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012745 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 int kind;
12747 Py_ssize_t i, j, len;
12748 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012749 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12752 return NULL;
12753
12754 kind = PyUnicode_KIND(self);
12755 data = PyUnicode_DATA(self);
12756 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012757 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12759 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012760 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012761
Benjamin Peterson14339b62009-01-31 16:36:08 +000012762 i = 0;
12763 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012764 while (i < len) {
12765 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12766 if (!BLOOM(sepmask, ch))
12767 break;
12768 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12769 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 i++;
12771 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012772 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012773
Benjamin Peterson14339b62009-01-31 16:36:08 +000012774 j = len;
12775 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012776 j--;
12777 while (j >= i) {
12778 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12779 if (!BLOOM(sepmask, ch))
12780 break;
12781 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12782 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012783 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012784 }
12785
Benjamin Peterson29060642009-01-31 22:14:21 +000012786 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012787 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012788
Victor Stinner7931d9a2011-11-04 00:22:48 +010012789 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790}
12791
12792PyObject*
12793PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12794{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012795 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012797 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798
Victor Stinnerde636f32011-10-01 03:55:54 +020012799 if (PyUnicode_READY(self) == -1)
12800 return NULL;
12801
Victor Stinner684d5fd2012-05-03 02:32:34 +020012802 length = PyUnicode_GET_LENGTH(self);
12803 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012804
Victor Stinner684d5fd2012-05-03 02:32:34 +020012805 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012806 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807
Victor Stinnerde636f32011-10-01 03:55:54 +020012808 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012809 PyErr_SetString(PyExc_IndexError, "string index out of range");
12810 return NULL;
12811 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012812 if (start >= length || end < start)
12813 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012814
Victor Stinner684d5fd2012-05-03 02:32:34 +020012815 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012816 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012817 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012818 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012819 }
12820 else {
12821 kind = PyUnicode_KIND(self);
12822 data = PyUnicode_1BYTE_DATA(self);
12823 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012824 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012825 length);
12826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828
12829static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012830do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832 Py_ssize_t len, i, j;
12833
12834 if (PyUnicode_READY(self) == -1)
12835 return NULL;
12836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012838
Victor Stinnercc7af722013-04-09 22:39:24 +020012839 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012840 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012841
12842 i = 0;
12843 if (striptype != RIGHTSTRIP) {
12844 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012845 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012846 if (!_Py_ascii_whitespace[ch])
12847 break;
12848 i++;
12849 }
12850 }
12851
12852 j = len;
12853 if (striptype != LEFTSTRIP) {
12854 j--;
12855 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012856 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012857 if (!_Py_ascii_whitespace[ch])
12858 break;
12859 j--;
12860 }
12861 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012862 }
12863 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012864 else {
12865 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012866 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012867
Victor Stinnercc7af722013-04-09 22:39:24 +020012868 i = 0;
12869 if (striptype != RIGHTSTRIP) {
12870 while (i < len) {
12871 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12872 if (!Py_UNICODE_ISSPACE(ch))
12873 break;
12874 i++;
12875 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012876 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012877
12878 j = len;
12879 if (striptype != LEFTSTRIP) {
12880 j--;
12881 while (j >= i) {
12882 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12883 if (!Py_UNICODE_ISSPACE(ch))
12884 break;
12885 j--;
12886 }
12887 j++;
12888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012889 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012890
Victor Stinner7931d9a2011-11-04 00:22:48 +010012891 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892}
12893
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012894
12895static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012896do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012897{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012898 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012899 if (PyUnicode_Check(sep))
12900 return _PyUnicode_XStrip(self, striptype, sep);
12901 else {
12902 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012903 "%s arg must be None or str",
12904 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012905 return NULL;
12906 }
12907 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012908
Benjamin Peterson14339b62009-01-31 16:36:08 +000012909 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012910}
12911
12912
INADA Naoki3ae20562017-01-16 20:41:20 +090012913/*[clinic input]
12914str.strip as unicode_strip
12915
12916 chars: object = None
12917 /
12918
Zachary Ware09895c22019-10-09 16:09:00 -050012919Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012920
12921If chars is given and not None, remove characters in chars instead.
12922[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012923
12924static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012925unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012926/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012927{
INADA Naoki3ae20562017-01-16 20:41:20 +090012928 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012929}
12930
12931
INADA Naoki3ae20562017-01-16 20:41:20 +090012932/*[clinic input]
12933str.lstrip as unicode_lstrip
12934
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012935 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012936 /
12937
12938Return a copy of the string with leading whitespace removed.
12939
12940If chars is given and not None, remove characters in chars instead.
12941[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012942
12943static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012944unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012945/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012946{
INADA Naoki3ae20562017-01-16 20:41:20 +090012947 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012948}
12949
12950
INADA Naoki3ae20562017-01-16 20:41:20 +090012951/*[clinic input]
12952str.rstrip as unicode_rstrip
12953
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012954 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012955 /
12956
12957Return a copy of the string with trailing whitespace removed.
12958
12959If chars is given and not None, remove characters in chars instead.
12960[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012961
12962static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012963unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012964/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012965{
INADA Naoki3ae20562017-01-16 20:41:20 +090012966 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012967}
12968
12969
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012971unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012972{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012973 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012975
Serhiy Storchaka05997252013-01-26 12:14:02 +020012976 if (len < 1)
12977 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012978
Victor Stinnerc4b49542011-12-11 22:44:26 +010012979 /* no repeat, return original string */
12980 if (len == 1)
12981 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012982
Benjamin Petersonbac79492012-01-14 13:34:47 -050012983 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 return NULL;
12985
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012986 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012987 PyErr_SetString(PyExc_OverflowError,
12988 "repeated string is too long");
12989 return NULL;
12990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012992
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012993 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012994 if (!u)
12995 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012996 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012999 int kind = PyUnicode_KIND(str);
13000 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010013001 if (kind == PyUnicode_1BYTE_KIND) {
13002 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013003 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010013004 }
13005 else if (kind == PyUnicode_2BYTE_KIND) {
13006 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013007 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010013008 ucs2[n] = fill_char;
13009 } else {
13010 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
13011 assert(kind == PyUnicode_4BYTE_KIND);
13012 for (n = 0; n < len; ++n)
13013 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020013014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 }
13016 else {
13017 /* number of characters copied this far */
13018 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013019 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020013021 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000013023 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020013025 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013026 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000013027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000013028 }
13029
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013030 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013031 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032}
13033
Alexander Belopolsky40018472011-02-26 01:02:56 +000013034PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013035PyUnicode_Replace(PyObject *str,
13036 PyObject *substr,
13037 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000013038 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013039{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013040 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13041 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013042 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013043 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013044}
13045
INADA Naoki3ae20562017-01-16 20:41:20 +090013046/*[clinic input]
13047str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048
INADA Naoki3ae20562017-01-16 20:41:20 +090013049 old: unicode
13050 new: unicode
13051 count: Py_ssize_t = -1
13052 Maximum number of occurrences to replace.
13053 -1 (the default value) means replace all occurrences.
13054 /
13055
13056Return a copy with all occurrences of substring old replaced by new.
13057
13058If the optional argument count is given, only the first count occurrences are
13059replaced.
13060[clinic start generated code]*/
13061
13062static PyObject *
13063unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13064 Py_ssize_t count)
13065/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013067 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013068 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013069 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070}
13071
sweeneydea81849b2020-04-22 17:05:48 -040013072/*[clinic input]
13073str.removeprefix as unicode_removeprefix
13074
13075 prefix: unicode
13076 /
13077
13078Return a str with the given prefix string removed if present.
13079
13080If the string starts with the prefix string, return string[len(prefix):].
13081Otherwise, return a copy of the original string.
13082[clinic start generated code]*/
13083
13084static PyObject *
13085unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13086/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13087{
13088 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13089 if (match == -1) {
13090 return NULL;
13091 }
13092 if (match) {
13093 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13094 PyUnicode_GET_LENGTH(self));
13095 }
13096 return unicode_result_unchanged(self);
13097}
13098
13099/*[clinic input]
13100str.removesuffix as unicode_removesuffix
13101
13102 suffix: unicode
13103 /
13104
13105Return a str with the given suffix string removed if present.
13106
13107If the string ends with the suffix string and that suffix is not empty,
13108return string[:-len(suffix)]. Otherwise, return a copy of the original
13109string.
13110[clinic start generated code]*/
13111
13112static PyObject *
13113unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13114/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13115{
13116 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13117 if (match == -1) {
13118 return NULL;
13119 }
13120 if (match) {
13121 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13122 - PyUnicode_GET_LENGTH(suffix));
13123 }
13124 return unicode_result_unchanged(self);
13125}
13126
Alexander Belopolsky40018472011-02-26 01:02:56 +000013127static PyObject *
13128unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013130 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 Py_ssize_t isize;
13132 Py_ssize_t osize, squote, dquote, i, o;
13133 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013134 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013135 const void *idata;
13136 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013139 return NULL;
13140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 isize = PyUnicode_GET_LENGTH(unicode);
13142 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013144 /* Compute length of output, quote characters, and
13145 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013146 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 max = 127;
13148 squote = dquote = 0;
13149 ikind = PyUnicode_KIND(unicode);
13150 for (i = 0; i < isize; i++) {
13151 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013152 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013154 case '\'': squote++; break;
13155 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013157 incr = 2;
13158 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 default:
13160 /* Fast-path ASCII */
13161 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013162 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013164 ;
13165 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013167 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013168 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013169 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013170 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013172 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013174 if (osize > PY_SSIZE_T_MAX - incr) {
13175 PyErr_SetString(PyExc_OverflowError,
13176 "string is too long to generate repr");
13177 return NULL;
13178 }
13179 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013180 }
13181
13182 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013183 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013185 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013186 if (dquote)
13187 /* Both squote and dquote present. Use squote,
13188 and escape them */
13189 osize += squote;
13190 else
13191 quote = '"';
13192 }
Victor Stinner55c08782013-04-14 18:45:39 +020013193 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194
13195 repr = PyUnicode_New(osize, max);
13196 if (repr == NULL)
13197 return NULL;
13198 okind = PyUnicode_KIND(repr);
13199 odata = PyUnicode_DATA(repr);
13200
13201 PyUnicode_WRITE(okind, odata, 0, quote);
13202 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013203 if (unchanged) {
13204 _PyUnicode_FastCopyCharacters(repr, 1,
13205 unicode, 0,
13206 isize);
13207 }
13208 else {
13209 for (i = 0, o = 1; i < isize; i++) {
13210 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211
Victor Stinner55c08782013-04-14 18:45:39 +020013212 /* Escape quotes and backslashes */
13213 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013214 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013216 continue;
13217 }
13218
13219 /* Map special whitespace to '\t', \n', '\r' */
13220 if (ch == '\t') {
13221 PyUnicode_WRITE(okind, odata, o++, '\\');
13222 PyUnicode_WRITE(okind, odata, o++, 't');
13223 }
13224 else if (ch == '\n') {
13225 PyUnicode_WRITE(okind, odata, o++, '\\');
13226 PyUnicode_WRITE(okind, odata, o++, 'n');
13227 }
13228 else if (ch == '\r') {
13229 PyUnicode_WRITE(okind, odata, o++, '\\');
13230 PyUnicode_WRITE(okind, odata, o++, 'r');
13231 }
13232
13233 /* Map non-printable US ASCII to '\xhh' */
13234 else if (ch < ' ' || ch == 0x7F) {
13235 PyUnicode_WRITE(okind, odata, o++, '\\');
13236 PyUnicode_WRITE(okind, odata, o++, 'x');
13237 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13238 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13239 }
13240
13241 /* Copy ASCII characters as-is */
13242 else if (ch < 0x7F) {
13243 PyUnicode_WRITE(okind, odata, o++, ch);
13244 }
13245
13246 /* Non-ASCII characters */
13247 else {
13248 /* Map Unicode whitespace and control characters
13249 (categories Z* and C* except ASCII space)
13250 */
13251 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13252 PyUnicode_WRITE(okind, odata, o++, '\\');
13253 /* Map 8-bit characters to '\xhh' */
13254 if (ch <= 0xff) {
13255 PyUnicode_WRITE(okind, odata, o++, 'x');
13256 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13257 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13258 }
13259 /* Map 16-bit characters to '\uxxxx' */
13260 else if (ch <= 0xffff) {
13261 PyUnicode_WRITE(okind, odata, o++, 'u');
13262 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13263 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13264 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13265 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13266 }
13267 /* Map 21-bit characters to '\U00xxxxxx' */
13268 else {
13269 PyUnicode_WRITE(okind, odata, o++, 'U');
13270 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13271 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13272 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13273 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13274 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13275 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13276 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13277 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13278 }
13279 }
13280 /* Copy characters as-is */
13281 else {
13282 PyUnicode_WRITE(okind, odata, o++, ch);
13283 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013284 }
13285 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013287 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013288 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013289 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013290}
13291
/* Docstring text for S.rfind(), defined as rfind__doc__ through
   PyDoc_STRVAR.  The literal is runtime text and is left exactly as
   found. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013292PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294\n\
13295Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013296such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297arguments start and end are interpreted as in slice notation.\n\
13298\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013299Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300
13301static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013304 /* initialize variables to prevent gcc warning */
13305 PyObject *substring = NULL;
13306 Py_ssize_t start = 0;
13307 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013308 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013310 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013313 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013315
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013316 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 if (result == -2)
13319 return NULL;
13320
Christian Heimes217cfd12007-12-02 14:31:20 +000013321 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013322}
13323
/* Docstring text for S.rindex(), defined as rindex__doc__ through
   PyDoc_STRVAR.  The literal is runtime text and is left exactly as
   found. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013324PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013325 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013327Return the highest index in S where substring sub is found,\n\
13328such that sub is contained within S[start:end]. Optional\n\
13329arguments start and end are interpreted as in slice notation.\n\
13330\n\
13331Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013332
13333static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013336 /* initialize variables to prevent gcc warning */
13337 PyObject *substring = NULL;
13338 Py_ssize_t start = 0;
13339 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013340 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013342 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013343 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013345 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013347
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013348 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 if (result == -2)
13351 return NULL;
13352
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353 if (result < 0) {
13354 PyErr_SetString(PyExc_ValueError, "substring not found");
13355 return NULL;
13356 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013357
Christian Heimes217cfd12007-12-02 14:31:20 +000013358 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013359}
13360
INADA Naoki3ae20562017-01-16 20:41:20 +090013361/*[clinic input]
13362str.rjust as unicode_rjust
13363
13364 width: Py_ssize_t
13365 fillchar: Py_UCS4 = ' '
13366 /
13367
13368Return a right-justified string of length width.
13369
13370Padding is done using the specified fill character (default is a space).
13371[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372
13373static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013374unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13375/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013376{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013377 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378 return NULL;
13379
Victor Stinnerc4b49542011-12-11 22:44:26 +010013380 if (PyUnicode_GET_LENGTH(self) >= width)
13381 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013382
Victor Stinnerc4b49542011-12-11 22:44:26 +010013383 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384}
13385
Alexander Belopolsky40018472011-02-26 01:02:56 +000013386PyObject *
13387PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013389 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013390 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013391
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013392 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013393}
13394
INADA Naoki3ae20562017-01-16 20:41:20 +090013395/*[clinic input]
13396str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397
INADA Naoki3ae20562017-01-16 20:41:20 +090013398 sep: object = None
13399 The delimiter according which to split the string.
13400 None (the default value) means split according to any whitespace,
13401 and discard empty strings from the result.
13402 maxsplit: Py_ssize_t = -1
13403 Maximum number of splits to do.
13404 -1 (the default value) means no limit.
13405
13406Return a list of the words in the string, using sep as the delimiter string.
13407[clinic start generated code]*/
13408
13409static PyObject *
13410unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13411/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013412{
INADA Naoki3ae20562017-01-16 20:41:20 +090013413 if (sep == Py_None)
13414 return split(self, NULL, maxsplit);
13415 if (PyUnicode_Check(sep))
13416 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013417
Victor Stinner998b8062018-09-12 00:23:25 +020013418 PyErr_Format(PyExc_TypeError,
13419 "must be str or None, not %.100s",
13420 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422}
13423
Thomas Wouters477c8d52006-05-27 19:21:47 +000013424PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013425PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013426{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013427 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013428 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013429 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013430 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013431
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013432 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013433 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013434
Victor Stinner14f8f022011-10-05 20:58:25 +020013435 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013436 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013437 len1 = PyUnicode_GET_LENGTH(str_obj);
13438 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013439 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013440 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013441 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013442 }
13443 buf1 = PyUnicode_DATA(str_obj);
13444 buf2 = PyUnicode_DATA(sep_obj);
13445 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013446 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013447 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013448 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013451 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013452 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013453 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13454 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13455 else
13456 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013457 break;
13458 case PyUnicode_2BYTE_KIND:
13459 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13460 break;
13461 case PyUnicode_4BYTE_KIND:
13462 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13463 break;
13464 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013465 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013466 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013467
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013468 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013469 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013470 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013471
13472 return out;
13473}
13474
13475
13476PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013477PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013478{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013479 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013480 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013481 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013482 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013483
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013484 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013485 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013486
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013487 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013488 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489 len1 = PyUnicode_GET_LENGTH(str_obj);
13490 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013491 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013492 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013493 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013494 }
13495 buf1 = PyUnicode_DATA(str_obj);
13496 buf2 = PyUnicode_DATA(sep_obj);
13497 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013498 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013499 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013500 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013502
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013503 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013504 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013505 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13506 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13507 else
13508 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013509 break;
13510 case PyUnicode_2BYTE_KIND:
13511 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13512 break;
13513 case PyUnicode_4BYTE_KIND:
13514 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13515 break;
13516 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013517 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013518 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013519
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013520 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013521 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013522 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013523
13524 return out;
13525}
13526
INADA Naoki3ae20562017-01-16 20:41:20 +090013527/*[clinic input]
13528str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013529
INADA Naoki3ae20562017-01-16 20:41:20 +090013530 sep: object
13531 /
13532
13533Partition the string into three parts using the given separator.
13534
13535This will search for the separator in the string. If the separator is found,
13536returns a 3-tuple containing the part before the separator, the separator
13537itself, and the part after it.
13538
13539If the separator is not found, returns a 3-tuple containing the original string
13540and two empty strings.
13541[clinic start generated code]*/
13542
13543static PyObject *
13544unicode_partition(PyObject *self, PyObject *sep)
13545/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013546{
INADA Naoki3ae20562017-01-16 20:41:20 +090013547 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013548}
13549
INADA Naoki3ae20562017-01-16 20:41:20 +090013550/*[clinic input]
13551str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013552
INADA Naoki3ae20562017-01-16 20:41:20 +090013553Partition the string into three parts using the given separator.
13554
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013555This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013556the separator is found, returns a 3-tuple containing the part before the
13557separator, the separator itself, and the part after it.
13558
13559If the separator is not found, returns a 3-tuple containing two empty strings
13560and the original string.
13561[clinic start generated code]*/
13562
13563static PyObject *
13564unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013565/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013566{
INADA Naoki3ae20562017-01-16 20:41:20 +090013567 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013568}
13569
Alexander Belopolsky40018472011-02-26 01:02:56 +000013570PyObject *
13571PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013572{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013573 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013574 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013575
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013576 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013577}
13578
INADA Naoki3ae20562017-01-16 20:41:20 +090013579/*[clinic input]
13580str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013581
INADA Naoki3ae20562017-01-16 20:41:20 +090013582Return a list of the words in the string, using sep as the delimiter string.
13583
13584Splits are done starting at the end of the string and working to the front.
13585[clinic start generated code]*/
13586
13587static PyObject *
13588unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13589/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013590{
INADA Naoki3ae20562017-01-16 20:41:20 +090013591 if (sep == Py_None)
13592 return rsplit(self, NULL, maxsplit);
13593 if (PyUnicode_Check(sep))
13594 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013595
Victor Stinner998b8062018-09-12 00:23:25 +020013596 PyErr_Format(PyExc_TypeError,
13597 "must be str or None, not %.100s",
13598 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013599 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013600}
13601
INADA Naoki3ae20562017-01-16 20:41:20 +090013602/*[clinic input]
13603str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013604
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013605 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013606
13607Return a list of the lines in the string, breaking at line boundaries.
13608
13609Line breaks are not included in the resulting list unless keepends is given and
13610true.
13611[clinic start generated code]*/
13612
13613static PyObject *
13614unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013615/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013616{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013617 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013618}
13619
13620static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013621PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013622{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013623 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013624}
13625
INADA Naoki3ae20562017-01-16 20:41:20 +090013626/*[clinic input]
13627str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013628
INADA Naoki3ae20562017-01-16 20:41:20 +090013629Convert uppercase characters to lowercase and lowercase characters to uppercase.
13630[clinic start generated code]*/
13631
13632static PyObject *
13633unicode_swapcase_impl(PyObject *self)
13634/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013635{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013636 if (PyUnicode_READY(self) == -1)
13637 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013638 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013639}
13640
Larry Hastings61272b72014-01-07 12:41:53 -080013641/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013642
Larry Hastings31826802013-10-19 00:09:25 -070013643@staticmethod
13644str.maketrans as unicode_maketrans
13645
13646 x: object
13647
13648 y: unicode=NULL
13649
13650 z: unicode=NULL
13651
13652 /
13653
13654Return a translation table usable for str.translate().
13655
13656If there is only one argument, it must be a dictionary mapping Unicode
13657ordinals (integers) or characters to Unicode ordinals, strings or None.
13658Character keys will be then converted to ordinals.
13659If there are two arguments, they must be strings of equal length, and
13660in the resulting dictionary, each character in x will be mapped to the
13661character at the same position in y. If there is a third argument, it
13662must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013663[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013664
Larry Hastings31826802013-10-19 00:09:25 -070013665static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013666unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013667/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013668{
Georg Brandlceee0772007-11-27 23:48:05 +000013669 PyObject *new = NULL, *key, *value;
13670 Py_ssize_t i = 0;
13671 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013672
Georg Brandlceee0772007-11-27 23:48:05 +000013673 new = PyDict_New();
13674 if (!new)
13675 return NULL;
13676 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013677 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013678 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013679
Georg Brandlceee0772007-11-27 23:48:05 +000013680 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013681 if (!PyUnicode_Check(x)) {
13682 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13683 "be a string if there is a second argument");
13684 goto err;
13685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013686 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013687 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13688 "arguments must have equal length");
13689 goto err;
13690 }
13691 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013692 x_kind = PyUnicode_KIND(x);
13693 y_kind = PyUnicode_KIND(y);
13694 x_data = PyUnicode_DATA(x);
13695 y_data = PyUnicode_DATA(y);
13696 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13697 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013698 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013699 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013700 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013701 if (!value) {
13702 Py_DECREF(key);
13703 goto err;
13704 }
Georg Brandlceee0772007-11-27 23:48:05 +000013705 res = PyDict_SetItem(new, key, value);
13706 Py_DECREF(key);
13707 Py_DECREF(value);
13708 if (res < 0)
13709 goto err;
13710 }
13711 /* create entries for deleting chars in z */
13712 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013713 z_kind = PyUnicode_KIND(z);
13714 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013715 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013716 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013717 if (!key)
13718 goto err;
13719 res = PyDict_SetItem(new, key, Py_None);
13720 Py_DECREF(key);
13721 if (res < 0)
13722 goto err;
13723 }
13724 }
13725 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013726 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013727 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013728
Georg Brandlceee0772007-11-27 23:48:05 +000013729 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013730 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013731 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13732 "to maketrans it must be a dict");
13733 goto err;
13734 }
13735 /* copy entries into the new dict, converting string keys to int keys */
13736 while (PyDict_Next(x, &i, &key, &value)) {
13737 if (PyUnicode_Check(key)) {
13738 /* convert string keys to integer keys */
13739 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013740 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013741 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13742 "table must be of length 1");
13743 goto err;
13744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013745 kind = PyUnicode_KIND(key);
13746 data = PyUnicode_DATA(key);
13747 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013748 if (!newkey)
13749 goto err;
13750 res = PyDict_SetItem(new, newkey, value);
13751 Py_DECREF(newkey);
13752 if (res < 0)
13753 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013754 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013755 /* just keep integer keys */
13756 if (PyDict_SetItem(new, key, value) < 0)
13757 goto err;
13758 } else {
13759 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13760 "be strings or integers");
13761 goto err;
13762 }
13763 }
13764 }
13765 return new;
13766 err:
13767 Py_DECREF(new);
13768 return NULL;
13769}
13770
INADA Naoki3ae20562017-01-16 20:41:20 +090013771/*[clinic input]
13772str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013773
INADA Naoki3ae20562017-01-16 20:41:20 +090013774 table: object
13775 Translation table, which must be a mapping of Unicode ordinals to
13776 Unicode ordinals, strings, or None.
13777 /
13778
13779Replace each character in the string using the given translation table.
13780
13781The table must implement lookup/indexing via __getitem__, for instance a
13782dictionary or list. If this operation raises LookupError, the character is
13783left untouched. Characters mapped to None are deleted.
13784[clinic start generated code]*/
13785
13786static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013787unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013788/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013789{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013790 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013791}
13792
INADA Naoki3ae20562017-01-16 20:41:20 +090013793/*[clinic input]
13794str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013795
INADA Naoki3ae20562017-01-16 20:41:20 +090013796Return a copy of the string converted to uppercase.
13797[clinic start generated code]*/
13798
13799static PyObject *
13800unicode_upper_impl(PyObject *self)
13801/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013802{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013803 if (PyUnicode_READY(self) == -1)
13804 return NULL;
13805 if (PyUnicode_IS_ASCII(self))
13806 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013807 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808}
13809
INADA Naoki3ae20562017-01-16 20:41:20 +090013810/*[clinic input]
13811str.zfill as unicode_zfill
13812
13813 width: Py_ssize_t
13814 /
13815
13816Pad a numeric string with zeros on the left, to fill a field of the given width.
13817
13818The string is never truncated.
13819[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820
13821static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013822unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013823/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013824{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013825 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013826 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013827 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013828 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013829 Py_UCS4 chr;
13830
Benjamin Petersonbac79492012-01-14 13:34:47 -050013831 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013832 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013833
Victor Stinnerc4b49542011-12-11 22:44:26 +010013834 if (PyUnicode_GET_LENGTH(self) >= width)
13835 return unicode_result_unchanged(self);
13836
13837 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013838
13839 u = pad(self, fill, 0, '0');
13840
Walter Dörwald068325e2002-04-15 13:36:47 +000013841 if (u == NULL)
13842 return NULL;
13843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013844 kind = PyUnicode_KIND(u);
13845 data = PyUnicode_DATA(u);
13846 chr = PyUnicode_READ(kind, data, fill);
13847
13848 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013849 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013850 PyUnicode_WRITE(kind, data, 0, chr);
13851 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013852 }
13853
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013854 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013855 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013856}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013857
#if 0
/* Compiled out by the surrounding #if 0: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013866PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013867 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013868\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013869Return True if S starts with the specified prefix, False otherwise.\n\
13870With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013871With optional end, stop comparing S at that position.\n\
13872prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013873
13874static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013875unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013876 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013877{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013878 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013879 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013880 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013881 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013882 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013883
Jesus Ceaac451502011-04-20 17:09:23 +020013884 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013885 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013886 if (PyTuple_Check(subobj)) {
13887 Py_ssize_t i;
13888 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013889 substring = PyTuple_GET_ITEM(subobj, i);
13890 if (!PyUnicode_Check(substring)) {
13891 PyErr_Format(PyExc_TypeError,
13892 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013893 "not %.100s",
13894 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013895 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013896 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013897 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013898 if (result == -1)
13899 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013900 if (result) {
13901 Py_RETURN_TRUE;
13902 }
13903 }
13904 /* nothing matched */
13905 Py_RETURN_FALSE;
13906 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013907 if (!PyUnicode_Check(subobj)) {
13908 PyErr_Format(PyExc_TypeError,
13909 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013910 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013911 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013912 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013913 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013914 if (result == -1)
13915 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013916 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013917}
13918
13919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013920PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013923Return True if S ends with the specified suffix, False otherwise.\n\
13924With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013925With optional end, stop comparing S at that position.\n\
13926suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013927
13928static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013929unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013930 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013931{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013932 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013933 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013934 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013935 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013936 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013937
Jesus Ceaac451502011-04-20 17:09:23 +020013938 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013939 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013940 if (PyTuple_Check(subobj)) {
13941 Py_ssize_t i;
13942 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013943 substring = PyTuple_GET_ITEM(subobj, i);
13944 if (!PyUnicode_Check(substring)) {
13945 PyErr_Format(PyExc_TypeError,
13946 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013947 "not %.100s",
13948 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013949 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013950 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013951 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013952 if (result == -1)
13953 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013954 if (result) {
13955 Py_RETURN_TRUE;
13956 }
13957 }
13958 Py_RETURN_FALSE;
13959 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013960 if (!PyUnicode_Check(subobj)) {
13961 PyErr_Format(PyExc_TypeError,
13962 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013963 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013964 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013965 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013966 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013967 if (result == -1)
13968 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013969 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013970}
13971
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013972static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013973_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013974{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013975 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13976 writer->data = PyUnicode_DATA(writer->buffer);
13977
13978 if (!writer->readonly) {
13979 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013980 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013981 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013982 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013983 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13984 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13985 writer->kind = PyUnicode_WCHAR_KIND;
13986 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13987
Victor Stinner8f674cc2013-04-17 23:02:17 +020013988 /* Copy-on-write mode: set buffer size to 0 so
13989 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13990 * next write. */
13991 writer->size = 0;
13992 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013993}
13994
Victor Stinnerd3f08822012-05-29 12:57:52 +020013995void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013996_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013997{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013998 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013999
14000 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020014001 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020014002
14003 /* use a value smaller than PyUnicode_1BYTE_KIND() so
14004 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14005 writer->kind = PyUnicode_WCHAR_KIND;
14006 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020014007}
14008
Inada Naoki770847a2019-06-24 12:30:24 +090014009// Initialize _PyUnicodeWriter with initial buffer
14010static inline void
14011_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
14012{
14013 memset(writer, 0, sizeof(*writer));
14014 writer->buffer = buffer;
14015 _PyUnicodeWriter_Update(writer);
14016 writer->min_length = writer->size;
14017}
14018
Victor Stinnerd3f08822012-05-29 12:57:52 +020014019int
14020_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14021 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020014022{
14023 Py_ssize_t newlen;
14024 PyObject *newbuffer;
14025
Victor Stinner2740e462016-09-06 16:58:36 -070014026 assert(maxchar <= MAX_UNICODE);
14027
Victor Stinnerca9381e2015-09-22 00:58:32 +020014028 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020014029 assert((maxchar > writer->maxchar && length >= 0)
14030 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014031
Victor Stinner202fdca2012-05-07 12:47:02 +020014032 if (length > PY_SSIZE_T_MAX - writer->pos) {
14033 PyErr_NoMemory();
14034 return -1;
14035 }
14036 newlen = writer->pos + length;
14037
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014038 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020014039
Victor Stinnerd3f08822012-05-29 12:57:52 +020014040 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020014041 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010014042 if (writer->overallocate
14043 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14044 /* overallocate to limit the number of realloc() */
14045 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014046 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014047 if (newlen < writer->min_length)
14048 newlen = writer->min_length;
14049
Victor Stinnerd3f08822012-05-29 12:57:52 +020014050 writer->buffer = PyUnicode_New(newlen, maxchar);
14051 if (writer->buffer == NULL)
14052 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014053 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014054 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014055 if (writer->overallocate
14056 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14057 /* overallocate to limit the number of realloc() */
14058 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014059 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014060 if (newlen < writer->min_length)
14061 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014062
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014063 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014064 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014065 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014066 newbuffer = PyUnicode_New(newlen, maxchar);
14067 if (newbuffer == NULL)
14068 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014069 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14070 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014071 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014072 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014073 }
14074 else {
14075 newbuffer = resize_compact(writer->buffer, newlen);
14076 if (newbuffer == NULL)
14077 return -1;
14078 }
14079 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014080 }
14081 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014082 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014083 newbuffer = PyUnicode_New(writer->size, maxchar);
14084 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014085 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014086 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14087 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014088 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014089 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014090 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014091 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014092
14093#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014094}
14095
Victor Stinnerca9381e2015-09-22 00:58:32 +020014096int
14097_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14098 enum PyUnicode_Kind kind)
14099{
14100 Py_UCS4 maxchar;
14101
14102 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14103 assert(writer->kind < kind);
14104
14105 switch (kind)
14106 {
14107 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14108 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14109 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14110 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014111 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014112 }
14113
14114 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14115}
14116
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014117static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014118_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014119{
Victor Stinner2740e462016-09-06 16:58:36 -070014120 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014121 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14122 return -1;
14123 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14124 writer->pos++;
14125 return 0;
14126}
14127
14128int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014129_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14130{
14131 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14132}
14133
14134int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014135_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14136{
14137 Py_UCS4 maxchar;
14138 Py_ssize_t len;
14139
14140 if (PyUnicode_READY(str) == -1)
14141 return -1;
14142 len = PyUnicode_GET_LENGTH(str);
14143 if (len == 0)
14144 return 0;
14145 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14146 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014147 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014148 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014149 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014150 Py_INCREF(str);
14151 writer->buffer = str;
14152 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014153 writer->pos += len;
14154 return 0;
14155 }
14156 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14157 return -1;
14158 }
14159 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14160 str, 0, len);
14161 writer->pos += len;
14162 return 0;
14163}
14164
Victor Stinnere215d962012-10-06 23:03:36 +020014165int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014166_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14167 Py_ssize_t start, Py_ssize_t end)
14168{
14169 Py_UCS4 maxchar;
14170 Py_ssize_t len;
14171
14172 if (PyUnicode_READY(str) == -1)
14173 return -1;
14174
14175 assert(0 <= start);
14176 assert(end <= PyUnicode_GET_LENGTH(str));
14177 assert(start <= end);
14178
14179 if (end == 0)
14180 return 0;
14181
14182 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14183 return _PyUnicodeWriter_WriteStr(writer, str);
14184
14185 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14186 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14187 else
14188 maxchar = writer->maxchar;
14189 len = end - start;
14190
14191 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14192 return -1;
14193
14194 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14195 str, start, len);
14196 writer->pos += len;
14197 return 0;
14198}
14199
14200int
Victor Stinner4a587072013-11-19 12:54:53 +010014201_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14202 const char *ascii, Py_ssize_t len)
14203{
14204 if (len == -1)
14205 len = strlen(ascii);
14206
Andy Lestere6be9b52020-02-11 20:28:35 -060014207 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014208
14209 if (writer->buffer == NULL && !writer->overallocate) {
14210 PyObject *str;
14211
14212 str = _PyUnicode_FromASCII(ascii, len);
14213 if (str == NULL)
14214 return -1;
14215
14216 writer->readonly = 1;
14217 writer->buffer = str;
14218 _PyUnicodeWriter_Update(writer);
14219 writer->pos += len;
14220 return 0;
14221 }
14222
14223 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14224 return -1;
14225
14226 switch (writer->kind)
14227 {
14228 case PyUnicode_1BYTE_KIND:
14229 {
14230 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14231 Py_UCS1 *data = writer->data;
14232
Christian Heimesf051e432016-09-13 20:22:02 +020014233 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014234 break;
14235 }
14236 case PyUnicode_2BYTE_KIND:
14237 {
14238 _PyUnicode_CONVERT_BYTES(
14239 Py_UCS1, Py_UCS2,
14240 ascii, ascii + len,
14241 (Py_UCS2 *)writer->data + writer->pos);
14242 break;
14243 }
14244 case PyUnicode_4BYTE_KIND:
14245 {
14246 _PyUnicode_CONVERT_BYTES(
14247 Py_UCS1, Py_UCS4,
14248 ascii, ascii + len,
14249 (Py_UCS4 *)writer->data + writer->pos);
14250 break;
14251 }
14252 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014253 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014254 }
14255
14256 writer->pos += len;
14257 return 0;
14258}
14259
14260int
14261_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14262 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014263{
14264 Py_UCS4 maxchar;
14265
Andy Lestere6be9b52020-02-11 20:28:35 -060014266 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014267 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14268 return -1;
14269 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14270 writer->pos += len;
14271 return 0;
14272}
14273
Victor Stinnerd3f08822012-05-29 12:57:52 +020014274PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014275_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014276{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014277 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014278
Victor Stinnerd3f08822012-05-29 12:57:52 +020014279 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014280 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014281 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014282 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014283
14284 str = writer->buffer;
14285 writer->buffer = NULL;
14286
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014287 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014288 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14289 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014290 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014291
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014292 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14293 PyObject *str2;
14294 str2 = resize_compact(str, writer->pos);
14295 if (str2 == NULL) {
14296 Py_DECREF(str);
14297 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014298 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014299 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014300 }
14301
Victor Stinner15a0bd32013-07-08 22:29:55 +020014302 assert(_PyUnicode_CheckConsistency(str, 1));
14303 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014304}
14305
Victor Stinnerd3f08822012-05-29 12:57:52 +020014306void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014307_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014308{
14309 Py_CLEAR(writer->buffer);
14310}
14311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014312#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014313
14314PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014315 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014316\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014317Return a formatted version of S, using substitutions from args and kwargs.\n\
14318The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014319
Eric Smith27bbca62010-11-04 17:06:58 +000014320PyDoc_STRVAR(format_map__doc__,
14321 "S.format_map(mapping) -> str\n\
14322\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014323Return a formatted version of S, using substitutions from mapping.\n\
14324The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014325
INADA Naoki3ae20562017-01-16 20:41:20 +090014326/*[clinic input]
14327str.__format__ as unicode___format__
14328
14329 format_spec: unicode
14330 /
14331
14332Return a formatted version of the string as described by format_spec.
14333[clinic start generated code]*/
14334
Eric Smith4a7d76d2008-05-30 18:10:19 +000014335static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014336unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014337/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014338{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014339 _PyUnicodeWriter writer;
14340 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014341
Victor Stinnerd3f08822012-05-29 12:57:52 +020014342 if (PyUnicode_READY(self) == -1)
14343 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014344 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014345 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14346 self, format_spec, 0,
14347 PyUnicode_GET_LENGTH(format_spec));
14348 if (ret == -1) {
14349 _PyUnicodeWriter_Dealloc(&writer);
14350 return NULL;
14351 }
14352 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014353}
14354
INADA Naoki3ae20562017-01-16 20:41:20 +090014355/*[clinic input]
14356str.__sizeof__ as unicode_sizeof
14357
14358Return the size of the string in memory, in bytes.
14359[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014360
14361static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014362unicode_sizeof_impl(PyObject *self)
14363/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014365 Py_ssize_t size;
14366
14367 /* If it's a compact object, account for base structure +
14368 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014369 if (PyUnicode_IS_COMPACT_ASCII(self))
14370 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14371 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014372 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014373 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014374 else {
14375 /* If it is a two-block object, account for base object, and
14376 for character block if present. */
14377 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014378 if (_PyUnicode_DATA_ANY(self))
14379 size += (PyUnicode_GET_LENGTH(self) + 1) *
14380 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014381 }
14382 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014383 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014384 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14385 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14386 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14387 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014388
14389 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014390}
14391
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014392static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014393unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014394{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014395 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014396 if (!copy)
14397 return NULL;
14398 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014399}
14400
Guido van Rossumd57fd912000-03-10 22:53:23 +000014401static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014402 UNICODE_ENCODE_METHODDEF
14403 UNICODE_REPLACE_METHODDEF
14404 UNICODE_SPLIT_METHODDEF
14405 UNICODE_RSPLIT_METHODDEF
14406 UNICODE_JOIN_METHODDEF
14407 UNICODE_CAPITALIZE_METHODDEF
14408 UNICODE_CASEFOLD_METHODDEF
14409 UNICODE_TITLE_METHODDEF
14410 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014411 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014412 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014413 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014414 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014415 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014416 UNICODE_LJUST_METHODDEF
14417 UNICODE_LOWER_METHODDEF
14418 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014419 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14420 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014421 UNICODE_RJUST_METHODDEF
14422 UNICODE_RSTRIP_METHODDEF
14423 UNICODE_RPARTITION_METHODDEF
14424 UNICODE_SPLITLINES_METHODDEF
14425 UNICODE_STRIP_METHODDEF
14426 UNICODE_SWAPCASE_METHODDEF
14427 UNICODE_TRANSLATE_METHODDEF
14428 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014429 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14430 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014431 UNICODE_REMOVEPREFIX_METHODDEF
14432 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014433 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014434 UNICODE_ISLOWER_METHODDEF
14435 UNICODE_ISUPPER_METHODDEF
14436 UNICODE_ISTITLE_METHODDEF
14437 UNICODE_ISSPACE_METHODDEF
14438 UNICODE_ISDECIMAL_METHODDEF
14439 UNICODE_ISDIGIT_METHODDEF
14440 UNICODE_ISNUMERIC_METHODDEF
14441 UNICODE_ISALPHA_METHODDEF
14442 UNICODE_ISALNUM_METHODDEF
14443 UNICODE_ISIDENTIFIER_METHODDEF
14444 UNICODE_ISPRINTABLE_METHODDEF
14445 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014446 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014447 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014448 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014449 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014450 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014451#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014452 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014453 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014454#endif
14455
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014456 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014457 {NULL, NULL}
14458};
14459
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014460static PyObject *
14461unicode_mod(PyObject *v, PyObject *w)
14462{
Brian Curtindfc80e32011-08-10 20:28:54 -050014463 if (!PyUnicode_Check(v))
14464 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014465 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014466}
14467
14468static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014469 0, /*nb_add*/
14470 0, /*nb_subtract*/
14471 0, /*nb_multiply*/
14472 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014473};
14474
Guido van Rossumd57fd912000-03-10 22:53:23 +000014475static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014476 (lenfunc) unicode_length, /* sq_length */
14477 PyUnicode_Concat, /* sq_concat */
14478 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14479 (ssizeargfunc) unicode_getitem, /* sq_item */
14480 0, /* sq_slice */
14481 0, /* sq_ass_item */
14482 0, /* sq_ass_slice */
14483 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014484};
14485
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014486static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014487unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014489 if (PyUnicode_READY(self) == -1)
14490 return NULL;
14491
Victor Stinnera15e2602020-04-08 02:01:56 +020014492 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014493 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014494 if (i == -1 && PyErr_Occurred())
14495 return NULL;
14496 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014497 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014498 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014499 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014500 Py_ssize_t start, stop, step, slicelength, i;
14501 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014502 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014503 const void *src_data;
14504 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014505 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014506 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014507
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014508 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014509 return NULL;
14510 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014511 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14512 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014513
14514 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014515 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014516 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014517 slicelength == PyUnicode_GET_LENGTH(self)) {
14518 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014519 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014520 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014521 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014522 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014523 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014524 src_kind = PyUnicode_KIND(self);
14525 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014526 if (!PyUnicode_IS_ASCII(self)) {
14527 kind_limit = kind_maxchar_limit(src_kind);
14528 max_char = 0;
14529 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14530 ch = PyUnicode_READ(src_kind, src_data, cur);
14531 if (ch > max_char) {
14532 max_char = ch;
14533 if (max_char >= kind_limit)
14534 break;
14535 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014536 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014537 }
Victor Stinner55c99112011-10-13 01:17:06 +020014538 else
14539 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014540 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014541 if (result == NULL)
14542 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014543 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014544 dest_data = PyUnicode_DATA(result);
14545
14546 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014547 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14548 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014549 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014550 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014551 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014552 } else {
14553 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14554 return NULL;
14555 }
14556}
14557
14558static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014559 (lenfunc)unicode_length, /* mp_length */
14560 (binaryfunc)unicode_subscript, /* mp_subscript */
14561 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014562};
14563
Guido van Rossumd57fd912000-03-10 22:53:23 +000014564
Guido van Rossumd57fd912000-03-10 22:53:23 +000014565/* Helpers for PyUnicode_Format() */
14566
Victor Stinnera47082312012-10-04 02:19:54 +020014567struct unicode_formatter_t {
14568 PyObject *args;
14569 int args_owned;
14570 Py_ssize_t arglen, argidx;
14571 PyObject *dict;
14572
14573 enum PyUnicode_Kind fmtkind;
14574 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014575 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014576 PyObject *fmtstr;
14577
14578 _PyUnicodeWriter writer;
14579};
14580
14581struct unicode_format_arg_t {
14582 Py_UCS4 ch;
14583 int flags;
14584 Py_ssize_t width;
14585 int prec;
14586 int sign;
14587};
14588
Guido van Rossumd57fd912000-03-10 22:53:23 +000014589static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014590unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014591{
Victor Stinnera47082312012-10-04 02:19:54 +020014592 Py_ssize_t argidx = ctx->argidx;
14593
14594 if (argidx < ctx->arglen) {
14595 ctx->argidx++;
14596 if (ctx->arglen < 0)
14597 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014598 else
Victor Stinnera47082312012-10-04 02:19:54 +020014599 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014600 }
14601 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014602 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014603 return NULL;
14604}
14605
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014606/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014607
Victor Stinnera47082312012-10-04 02:19:54 +020014608/* Format a float into the writer if the writer is not NULL, or into *p_output
14609 otherwise.
14610
14611 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014612static int
Victor Stinnera47082312012-10-04 02:19:54 +020014613formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14614 PyObject **p_output,
14615 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014616{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014617 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014618 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014619 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014620 int prec;
14621 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014622
Guido van Rossumd57fd912000-03-10 22:53:23 +000014623 x = PyFloat_AsDouble(v);
14624 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014625 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014626
Victor Stinnera47082312012-10-04 02:19:54 +020014627 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014628 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014629 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014630
Victor Stinnera47082312012-10-04 02:19:54 +020014631 if (arg->flags & F_ALT)
14632 dtoa_flags = Py_DTSF_ALT;
14633 else
14634 dtoa_flags = 0;
14635 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014636 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014637 return -1;
14638 len = strlen(p);
14639 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014640 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014641 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014642 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014643 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014644 }
14645 else
14646 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014647 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014648 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014649}
14650
Victor Stinnerd0880d52012-04-27 23:40:13 +020014651/* formatlong() emulates the format codes d, u, o, x and X, and
14652 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14653 * Python's regular ints.
14654 * Return value: a new PyUnicodeObject*, or NULL if error.
14655 * The output string is of the form
14656 * "-"? ("0x" | "0X")? digit+
14657 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14658 * set in flags. The case of hex digits will be correct,
14659 * There will be at least prec digits, zero-filled on the left if
14660 * necessary to get that many.
14661 * val object to be converted
14662 * flags bitmask of format flags; only F_ALT is looked at
14663 * prec minimum number of digits; 0-fill on left if needed
14664 * type a character in [duoxX]; u acts the same as d
14665 *
14666 * CAUTION: o, x and X conversions on regular ints can never
14667 * produce a '-' sign, but can for Python's unbounded ints.
14668 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014669PyObject *
14670_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014671{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014672 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014673 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014674 Py_ssize_t i;
14675 int sign; /* 1 if '-', else 0 */
14676 int len; /* number of characters */
14677 Py_ssize_t llen;
14678 int numdigits; /* len == numnondigits + numdigits */
14679 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014680
Victor Stinnerd0880d52012-04-27 23:40:13 +020014681 /* Avoid exceeding SSIZE_T_MAX */
14682 if (prec > INT_MAX-3) {
14683 PyErr_SetString(PyExc_OverflowError,
14684 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014685 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014686 }
14687
14688 assert(PyLong_Check(val));
14689
14690 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014691 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014692 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014693 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014694 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014695 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014696 /* int and int subclasses should print numerically when a numeric */
14697 /* format code is used (see issue18780) */
14698 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014699 break;
14700 case 'o':
14701 numnondigits = 2;
14702 result = PyNumber_ToBase(val, 8);
14703 break;
14704 case 'x':
14705 case 'X':
14706 numnondigits = 2;
14707 result = PyNumber_ToBase(val, 16);
14708 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014709 }
14710 if (!result)
14711 return NULL;
14712
14713 assert(unicode_modifiable(result));
14714 assert(PyUnicode_IS_READY(result));
14715 assert(PyUnicode_IS_ASCII(result));
14716
14717 /* To modify the string in-place, there can only be one reference. */
14718 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014719 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014720 PyErr_BadInternalCall();
14721 return NULL;
14722 }
14723 buf = PyUnicode_DATA(result);
14724 llen = PyUnicode_GET_LENGTH(result);
14725 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014726 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014727 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014728 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014729 return NULL;
14730 }
14731 len = (int)llen;
14732 sign = buf[0] == '-';
14733 numnondigits += sign;
14734 numdigits = len - numnondigits;
14735 assert(numdigits > 0);
14736
14737 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014738 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014739 (type == 'o' || type == 'x' || type == 'X'))) {
14740 assert(buf[sign] == '0');
14741 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14742 buf[sign+1] == 'o');
14743 numnondigits -= 2;
14744 buf += 2;
14745 len -= 2;
14746 if (sign)
14747 buf[0] = '-';
14748 assert(len == numnondigits + numdigits);
14749 assert(numdigits > 0);
14750 }
14751
14752 /* Fill with leading zeroes to meet minimum width. */
14753 if (prec > numdigits) {
14754 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14755 numnondigits + prec);
14756 char *b1;
14757 if (!r1) {
14758 Py_DECREF(result);
14759 return NULL;
14760 }
14761 b1 = PyBytes_AS_STRING(r1);
14762 for (i = 0; i < numnondigits; ++i)
14763 *b1++ = *buf++;
14764 for (i = 0; i < prec - numdigits; i++)
14765 *b1++ = '0';
14766 for (i = 0; i < numdigits; i++)
14767 *b1++ = *buf++;
14768 *b1 = '\0';
14769 Py_DECREF(result);
14770 result = r1;
14771 buf = PyBytes_AS_STRING(result);
14772 len = numnondigits + prec;
14773 }
14774
14775 /* Fix up case for hex conversions. */
14776 if (type == 'X') {
14777 /* Need to convert all lower case letters to upper case.
14778 and need to convert 0x to 0X (and -0x to -0X). */
14779 for (i = 0; i < len; i++)
14780 if (buf[i] >= 'a' && buf[i] <= 'x')
14781 buf[i] -= 'a'-'A';
14782 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014783 if (!PyUnicode_Check(result)
14784 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014785 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014786 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014787 Py_DECREF(result);
14788 result = unicode;
14789 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014790 else if (len != PyUnicode_GET_LENGTH(result)) {
14791 if (PyUnicode_Resize(&result, len) < 0)
14792 Py_CLEAR(result);
14793 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014794 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014795}
14796
Ethan Furmandf3ed242014-01-05 06:50:30 -080014797/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014798 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014799 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014800 * -1 and raise an exception on error */
14801static int
Victor Stinnera47082312012-10-04 02:19:54 +020014802mainformatlong(PyObject *v,
14803 struct unicode_format_arg_t *arg,
14804 PyObject **p_output,
14805 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014806{
14807 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014808 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014809
14810 if (!PyNumber_Check(v))
14811 goto wrongtype;
14812
Ethan Furman9ab74802014-03-21 06:38:46 -070014813 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014814 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014815 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014816 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014817 }
14818 else {
14819 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014820 }
14821 if (iobj == NULL ) {
14822 if (PyErr_ExceptionMatches(PyExc_TypeError))
14823 goto wrongtype;
14824 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014825 }
14826 assert(PyLong_Check(iobj));
14827 }
14828 else {
14829 iobj = v;
14830 Py_INCREF(iobj);
14831 }
14832
14833 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014834 && arg->width == -1 && arg->prec == -1
14835 && !(arg->flags & (F_SIGN | F_BLANK))
14836 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014837 {
14838 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014839 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014840 int base;
14841
Victor Stinnera47082312012-10-04 02:19:54 +020014842 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014843 {
14844 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014845 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014846 case 'd':
14847 case 'i':
14848 case 'u':
14849 base = 10;
14850 break;
14851 case 'o':
14852 base = 8;
14853 break;
14854 case 'x':
14855 case 'X':
14856 base = 16;
14857 break;
14858 }
14859
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014860 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14861 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014862 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014863 }
14864 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014865 return 1;
14866 }
14867
Ethan Furmanb95b5612015-01-23 20:05:18 -080014868 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014869 Py_DECREF(iobj);
14870 if (res == NULL)
14871 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014872 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014873 return 0;
14874
14875wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014876 switch(type)
14877 {
14878 case 'o':
14879 case 'x':
14880 case 'X':
14881 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014882 "%%%c format: an integer is required, "
14883 "not %.200s",
14884 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014885 break;
14886 default:
14887 PyErr_Format(PyExc_TypeError,
Serhiy Storchakae2ec0b22020-10-09 14:14:37 +030014888 "%%%c format: a real number is required, "
Victor Stinner998b8062018-09-12 00:23:25 +020014889 "not %.200s",
14890 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014891 break;
14892 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014893 return -1;
14894}
14895
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014896static Py_UCS4
14897formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014898{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014899 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014900 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014901 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014902 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014903 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014904 goto onError;
14905 }
14906 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014907 int overflow;
14908 long x = PyLong_AsLongAndOverflow(v, &overflow);
14909 if (x == -1 && PyErr_Occurred()) {
14910 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014911 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014912 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014913 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014914 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014915
Victor Stinner8faf8212011-12-08 22:14:11 +010014916 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014917 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014918 PyErr_SetString(PyExc_OverflowError,
14919 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014920 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014921 }
14922
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014923 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014924 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014925
Benjamin Peterson29060642009-01-31 22:14:21 +000014926 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014927 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014928 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014929 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014930}
14931
Victor Stinnera47082312012-10-04 02:19:54 +020014932/* Parse options of an argument: flags, width, precision.
14933 Handle also "%(name)" syntax.
14934
14935 Return 0 if the argument has been formatted into arg->str.
14936 Return 1 if the argument has been written into ctx->writer,
14937 Raise an exception and return -1 on error. */
14938static int
14939unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14940 struct unicode_format_arg_t *arg)
14941{
14942#define FORMAT_READ(ctx) \
14943 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14944
14945 PyObject *v;
14946
Victor Stinnera47082312012-10-04 02:19:54 +020014947 if (arg->ch == '(') {
14948 /* Get argument value from a dictionary. Example: "%(name)s". */
14949 Py_ssize_t keystart;
14950 Py_ssize_t keylen;
14951 PyObject *key;
14952 int pcount = 1;
14953
14954 if (ctx->dict == NULL) {
14955 PyErr_SetString(PyExc_TypeError,
14956 "format requires a mapping");
14957 return -1;
14958 }
14959 ++ctx->fmtpos;
14960 --ctx->fmtcnt;
14961 keystart = ctx->fmtpos;
14962 /* Skip over balanced parentheses */
14963 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14964 arg->ch = FORMAT_READ(ctx);
14965 if (arg->ch == ')')
14966 --pcount;
14967 else if (arg->ch == '(')
14968 ++pcount;
14969 ctx->fmtpos++;
14970 }
14971 keylen = ctx->fmtpos - keystart - 1;
14972 if (ctx->fmtcnt < 0 || pcount > 0) {
14973 PyErr_SetString(PyExc_ValueError,
14974 "incomplete format key");
14975 return -1;
14976 }
14977 key = PyUnicode_Substring(ctx->fmtstr,
14978 keystart, keystart + keylen);
14979 if (key == NULL)
14980 return -1;
14981 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014982 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014983 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014984 }
14985 ctx->args = PyObject_GetItem(ctx->dict, key);
14986 Py_DECREF(key);
14987 if (ctx->args == NULL)
14988 return -1;
14989 ctx->args_owned = 1;
14990 ctx->arglen = -1;
14991 ctx->argidx = -2;
14992 }
14993
14994 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014995 while (--ctx->fmtcnt >= 0) {
14996 arg->ch = FORMAT_READ(ctx);
14997 ctx->fmtpos++;
14998 switch (arg->ch) {
14999 case '-': arg->flags |= F_LJUST; continue;
15000 case '+': arg->flags |= F_SIGN; continue;
15001 case ' ': arg->flags |= F_BLANK; continue;
15002 case '#': arg->flags |= F_ALT; continue;
15003 case '0': arg->flags |= F_ZERO; continue;
15004 }
15005 break;
15006 }
15007
15008 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020015009 if (arg->ch == '*') {
15010 v = unicode_format_getnextarg(ctx);
15011 if (v == NULL)
15012 return -1;
15013 if (!PyLong_Check(v)) {
15014 PyErr_SetString(PyExc_TypeError,
15015 "* wants int");
15016 return -1;
15017 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015018 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015019 if (arg->width == -1 && PyErr_Occurred())
15020 return -1;
15021 if (arg->width < 0) {
15022 arg->flags |= F_LJUST;
15023 arg->width = -arg->width;
15024 }
15025 if (--ctx->fmtcnt >= 0) {
15026 arg->ch = FORMAT_READ(ctx);
15027 ctx->fmtpos++;
15028 }
15029 }
15030 else if (arg->ch >= '0' && arg->ch <= '9') {
15031 arg->width = arg->ch - '0';
15032 while (--ctx->fmtcnt >= 0) {
15033 arg->ch = FORMAT_READ(ctx);
15034 ctx->fmtpos++;
15035 if (arg->ch < '0' || arg->ch > '9')
15036 break;
15037 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15038 mixing signed and unsigned comparison. Since arg->ch is between
15039 '0' and '9', casting to int is safe. */
15040 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15041 PyErr_SetString(PyExc_ValueError,
15042 "width too big");
15043 return -1;
15044 }
15045 arg->width = arg->width*10 + (arg->ch - '0');
15046 }
15047 }
15048
15049 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015050 if (arg->ch == '.') {
15051 arg->prec = 0;
15052 if (--ctx->fmtcnt >= 0) {
15053 arg->ch = FORMAT_READ(ctx);
15054 ctx->fmtpos++;
15055 }
15056 if (arg->ch == '*') {
15057 v = unicode_format_getnextarg(ctx);
15058 if (v == NULL)
15059 return -1;
15060 if (!PyLong_Check(v)) {
15061 PyErr_SetString(PyExc_TypeError,
15062 "* wants int");
15063 return -1;
15064 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015065 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015066 if (arg->prec == -1 && PyErr_Occurred())
15067 return -1;
15068 if (arg->prec < 0)
15069 arg->prec = 0;
15070 if (--ctx->fmtcnt >= 0) {
15071 arg->ch = FORMAT_READ(ctx);
15072 ctx->fmtpos++;
15073 }
15074 }
15075 else if (arg->ch >= '0' && arg->ch <= '9') {
15076 arg->prec = arg->ch - '0';
15077 while (--ctx->fmtcnt >= 0) {
15078 arg->ch = FORMAT_READ(ctx);
15079 ctx->fmtpos++;
15080 if (arg->ch < '0' || arg->ch > '9')
15081 break;
15082 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15083 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015084 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015085 return -1;
15086 }
15087 arg->prec = arg->prec*10 + (arg->ch - '0');
15088 }
15089 }
15090 }
15091
15092 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15093 if (ctx->fmtcnt >= 0) {
15094 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15095 if (--ctx->fmtcnt >= 0) {
15096 arg->ch = FORMAT_READ(ctx);
15097 ctx->fmtpos++;
15098 }
15099 }
15100 }
15101 if (ctx->fmtcnt < 0) {
15102 PyErr_SetString(PyExc_ValueError,
15103 "incomplete format");
15104 return -1;
15105 }
15106 return 0;
15107
15108#undef FORMAT_READ
15109}
15110
15111/* Format one argument. Supported conversion specifiers:
15112
15113 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015114 - "i", "d", "u": int or float
15115 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015116 - "e", "E", "f", "F", "g", "G": float
15117 - "c": int or str (1 character)
15118
Victor Stinner8dbd4212012-12-04 09:30:24 +010015119 When possible, the output is written directly into the Unicode writer
15120 (ctx->writer). A string is created when padding is required.
15121
Victor Stinnera47082312012-10-04 02:19:54 +020015122 Return 0 if the argument has been formatted into *p_str,
15123 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015124 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015125static int
15126unicode_format_arg_format(struct unicode_formatter_t *ctx,
15127 struct unicode_format_arg_t *arg,
15128 PyObject **p_str)
15129{
15130 PyObject *v;
15131 _PyUnicodeWriter *writer = &ctx->writer;
15132
15133 if (ctx->fmtcnt == 0)
15134 ctx->writer.overallocate = 0;
15135
Victor Stinnera47082312012-10-04 02:19:54 +020015136 v = unicode_format_getnextarg(ctx);
15137 if (v == NULL)
15138 return -1;
15139
Victor Stinnera47082312012-10-04 02:19:54 +020015140
15141 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015142 case 's':
15143 case 'r':
15144 case 'a':
15145 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15146 /* Fast path */
15147 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15148 return -1;
15149 return 1;
15150 }
15151
15152 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15153 *p_str = v;
15154 Py_INCREF(*p_str);
15155 }
15156 else {
15157 if (arg->ch == 's')
15158 *p_str = PyObject_Str(v);
15159 else if (arg->ch == 'r')
15160 *p_str = PyObject_Repr(v);
15161 else
15162 *p_str = PyObject_ASCII(v);
15163 }
15164 break;
15165
15166 case 'i':
15167 case 'd':
15168 case 'u':
15169 case 'o':
15170 case 'x':
15171 case 'X':
15172 {
15173 int ret = mainformatlong(v, arg, p_str, writer);
15174 if (ret != 0)
15175 return ret;
15176 arg->sign = 1;
15177 break;
15178 }
15179
15180 case 'e':
15181 case 'E':
15182 case 'f':
15183 case 'F':
15184 case 'g':
15185 case 'G':
15186 if (arg->width == -1 && arg->prec == -1
15187 && !(arg->flags & (F_SIGN | F_BLANK)))
15188 {
15189 /* Fast path */
15190 if (formatfloat(v, arg, NULL, writer) == -1)
15191 return -1;
15192 return 1;
15193 }
15194
15195 arg->sign = 1;
15196 if (formatfloat(v, arg, p_str, NULL) == -1)
15197 return -1;
15198 break;
15199
15200 case 'c':
15201 {
15202 Py_UCS4 ch = formatchar(v);
15203 if (ch == (Py_UCS4) -1)
15204 return -1;
15205 if (arg->width == -1 && arg->prec == -1) {
15206 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015207 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015208 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015209 return 1;
15210 }
15211 *p_str = PyUnicode_FromOrdinal(ch);
15212 break;
15213 }
15214
15215 default:
15216 PyErr_Format(PyExc_ValueError,
15217 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015218 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015219 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15220 (int)arg->ch,
15221 ctx->fmtpos - 1);
15222 return -1;
15223 }
15224 if (*p_str == NULL)
15225 return -1;
15226 assert (PyUnicode_Check(*p_str));
15227 return 0;
15228}
15229
15230static int
15231unicode_format_arg_output(struct unicode_formatter_t *ctx,
15232 struct unicode_format_arg_t *arg,
15233 PyObject *str)
15234{
15235 Py_ssize_t len;
15236 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015237 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015238 Py_ssize_t pindex;
15239 Py_UCS4 signchar;
15240 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015241 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015242 Py_ssize_t sublen;
15243 _PyUnicodeWriter *writer = &ctx->writer;
15244 Py_UCS4 fill;
15245
15246 fill = ' ';
15247 if (arg->sign && arg->flags & F_ZERO)
15248 fill = '0';
15249
15250 if (PyUnicode_READY(str) == -1)
15251 return -1;
15252
15253 len = PyUnicode_GET_LENGTH(str);
15254 if ((arg->width == -1 || arg->width <= len)
15255 && (arg->prec == -1 || arg->prec >= len)
15256 && !(arg->flags & (F_SIGN | F_BLANK)))
15257 {
15258 /* Fast path */
15259 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15260 return -1;
15261 return 0;
15262 }
15263
15264 /* Truncate the string for "s", "r" and "a" formats
15265 if the precision is set */
15266 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15267 if (arg->prec >= 0 && len > arg->prec)
15268 len = arg->prec;
15269 }
15270
15271 /* Adjust sign and width */
15272 kind = PyUnicode_KIND(str);
15273 pbuf = PyUnicode_DATA(str);
15274 pindex = 0;
15275 signchar = '\0';
15276 if (arg->sign) {
15277 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15278 if (ch == '-' || ch == '+') {
15279 signchar = ch;
15280 len--;
15281 pindex++;
15282 }
15283 else if (arg->flags & F_SIGN)
15284 signchar = '+';
15285 else if (arg->flags & F_BLANK)
15286 signchar = ' ';
15287 else
15288 arg->sign = 0;
15289 }
15290 if (arg->width < len)
15291 arg->width = len;
15292
15293 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015294 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015295 if (!(arg->flags & F_LJUST)) {
15296 if (arg->sign) {
15297 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015298 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015299 }
15300 else {
15301 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015302 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015303 }
15304 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015305 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15306 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015307 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015308 }
15309
Victor Stinnera47082312012-10-04 02:19:54 +020015310 buflen = arg->width;
15311 if (arg->sign && len == arg->width)
15312 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015313 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015314 return -1;
15315
15316 /* Write the sign if needed */
15317 if (arg->sign) {
15318 if (fill != ' ') {
15319 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15320 writer->pos += 1;
15321 }
15322 if (arg->width > len)
15323 arg->width--;
15324 }
15325
15326 /* Write the numeric prefix for "x", "X" and "o" formats
15327 if the alternate form is used.
15328 For example, write "0x" for the "%#x" format. */
15329 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15330 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15331 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15332 if (fill != ' ') {
15333 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15334 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15335 writer->pos += 2;
15336 pindex += 2;
15337 }
15338 arg->width -= 2;
15339 if (arg->width < 0)
15340 arg->width = 0;
15341 len -= 2;
15342 }
15343
15344 /* Pad left with the fill character if needed */
15345 if (arg->width > len && !(arg->flags & F_LJUST)) {
15346 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015347 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015348 writer->pos += sublen;
15349 arg->width = len;
15350 }
15351
15352 /* If padding with spaces: write sign if needed and/or numeric prefix if
15353 the alternate form is used */
15354 if (fill == ' ') {
15355 if (arg->sign) {
15356 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15357 writer->pos += 1;
15358 }
15359 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15360 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15361 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15362 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15363 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15364 writer->pos += 2;
15365 pindex += 2;
15366 }
15367 }
15368
15369 /* Write characters */
15370 if (len) {
15371 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15372 str, pindex, len);
15373 writer->pos += len;
15374 }
15375
15376 /* Pad right with the fill character if needed */
15377 if (arg->width > len) {
15378 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015379 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015380 writer->pos += sublen;
15381 }
15382 return 0;
15383}
15384
15385/* Helper of PyUnicode_Format(): format one arg.
15386 Return 0 on success, raise an exception and return -1 on error. */
15387static int
15388unicode_format_arg(struct unicode_formatter_t *ctx)
15389{
15390 struct unicode_format_arg_t arg;
15391 PyObject *str;
15392 int ret;
15393
Victor Stinner8dbd4212012-12-04 09:30:24 +010015394 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015395 if (arg.ch == '%') {
15396 ctx->fmtpos++;
15397 ctx->fmtcnt--;
15398 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15399 return -1;
15400 return 0;
15401 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015402 arg.flags = 0;
15403 arg.width = -1;
15404 arg.prec = -1;
15405 arg.sign = 0;
15406 str = NULL;
15407
Victor Stinnera47082312012-10-04 02:19:54 +020015408 ret = unicode_format_arg_parse(ctx, &arg);
15409 if (ret == -1)
15410 return -1;
15411
15412 ret = unicode_format_arg_format(ctx, &arg, &str);
15413 if (ret == -1)
15414 return -1;
15415
15416 if (ret != 1) {
15417 ret = unicode_format_arg_output(ctx, &arg, str);
15418 Py_DECREF(str);
15419 if (ret == -1)
15420 return -1;
15421 }
15422
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015423 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015424 PyErr_SetString(PyExc_TypeError,
15425 "not all arguments converted during string formatting");
15426 return -1;
15427 }
15428 return 0;
15429}
15430
Alexander Belopolsky40018472011-02-26 01:02:56 +000015431PyObject *
15432PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015433{
Victor Stinnera47082312012-10-04 02:19:54 +020015434 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015435
Guido van Rossumd57fd912000-03-10 22:53:23 +000015436 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015437 PyErr_BadInternalCall();
15438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015439 }
Victor Stinnera47082312012-10-04 02:19:54 +020015440
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015441 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015442 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015443
15444 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015445 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15446 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15447 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15448 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015449
Victor Stinner8f674cc2013-04-17 23:02:17 +020015450 _PyUnicodeWriter_Init(&ctx.writer);
15451 ctx.writer.min_length = ctx.fmtcnt + 100;
15452 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015453
Guido van Rossumd57fd912000-03-10 22:53:23 +000015454 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015455 ctx.arglen = PyTuple_Size(args);
15456 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015457 }
15458 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015459 ctx.arglen = -1;
15460 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015461 }
Victor Stinnera47082312012-10-04 02:19:54 +020015462 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015463 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015464 ctx.dict = args;
15465 else
15466 ctx.dict = NULL;
15467 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015468
Victor Stinnera47082312012-10-04 02:19:54 +020015469 while (--ctx.fmtcnt >= 0) {
15470 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015471 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015472
15473 nonfmtpos = ctx.fmtpos++;
15474 while (ctx.fmtcnt >= 0 &&
15475 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15476 ctx.fmtpos++;
15477 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015478 }
Victor Stinnera47082312012-10-04 02:19:54 +020015479 if (ctx.fmtcnt < 0) {
15480 ctx.fmtpos--;
15481 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015482 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015483
Victor Stinnercfc4c132013-04-03 01:48:39 +020015484 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15485 nonfmtpos, ctx.fmtpos) < 0)
15486 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015487 }
15488 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015489 ctx.fmtpos++;
15490 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015491 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015492 }
15493 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015494
Victor Stinnera47082312012-10-04 02:19:54 +020015495 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015496 PyErr_SetString(PyExc_TypeError,
15497 "not all arguments converted during string formatting");
15498 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015499 }
15500
Victor Stinnera47082312012-10-04 02:19:54 +020015501 if (ctx.args_owned) {
15502 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015503 }
Victor Stinnera47082312012-10-04 02:19:54 +020015504 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015505
Benjamin Peterson29060642009-01-31 22:14:21 +000015506 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015507 _PyUnicodeWriter_Dealloc(&ctx.writer);
15508 if (ctx.args_owned) {
15509 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015510 }
15511 return NULL;
15512}
15513
Jeremy Hylton938ace62002-07-17 16:30:39 +000015514static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015515unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15516
15517/*[clinic input]
15518@classmethod
15519str.__new__ as unicode_new
15520
15521 object as x: object = NULL
15522 encoding: str = NULL
15523 errors: str = NULL
15524
15525[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015526
Tim Peters6d6c1a32001-08-02 04:15:00 +000015527static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015528unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15529 const char *errors)
15530/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015531{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015532 PyObject *unicode;
15533 if (x == NULL) {
15534 unicode = unicode_new_empty();
15535 }
15536 else if (encoding == NULL && errors == NULL) {
15537 unicode = PyObject_Str(x);
15538 }
15539 else {
15540 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15541 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015542
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015543 if (unicode != NULL && type != &PyUnicode_Type) {
15544 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15545 }
15546 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015547}
15548
Guido van Rossume023fe02001-08-30 03:12:59 +000015549static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015550unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015551{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015552 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015553 Py_ssize_t length, char_size;
15554 int share_wstr, share_utf8;
15555 unsigned int kind;
15556 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015557
Benjamin Peterson14339b62009-01-31 16:36:08 +000015558 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015559 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015560 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015561 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015562 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015563
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015564 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015565 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015566 return NULL;
15567 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015568 kind = PyUnicode_KIND(unicode);
15569 length = PyUnicode_GET_LENGTH(unicode);
15570
15571 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015572#ifdef Py_DEBUG
15573 _PyUnicode_HASH(self) = -1;
15574#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015575 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015576#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015577 _PyUnicode_STATE(self).interned = 0;
15578 _PyUnicode_STATE(self).kind = kind;
15579 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015580 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015581 _PyUnicode_STATE(self).ready = 1;
15582 _PyUnicode_WSTR(self) = NULL;
15583 _PyUnicode_UTF8_LENGTH(self) = 0;
15584 _PyUnicode_UTF8(self) = NULL;
15585 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015586 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015587
15588 share_utf8 = 0;
15589 share_wstr = 0;
15590 if (kind == PyUnicode_1BYTE_KIND) {
15591 char_size = 1;
15592 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15593 share_utf8 = 1;
15594 }
15595 else if (kind == PyUnicode_2BYTE_KIND) {
15596 char_size = 2;
15597 if (sizeof(wchar_t) == 2)
15598 share_wstr = 1;
15599 }
15600 else {
15601 assert(kind == PyUnicode_4BYTE_KIND);
15602 char_size = 4;
15603 if (sizeof(wchar_t) == 4)
15604 share_wstr = 1;
15605 }
15606
15607 /* Ensure we won't overflow the length. */
15608 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15609 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015610 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015611 }
Victor Stinner32bd68c2020-12-01 10:37:39 +010015612 data = PyObject_Malloc((length + 1) * char_size);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015613 if (data == NULL) {
15614 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015615 goto onError;
15616 }
15617
Victor Stinnerc3c74152011-10-02 20:39:55 +020015618 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015619 if (share_utf8) {
15620 _PyUnicode_UTF8_LENGTH(self) = length;
15621 _PyUnicode_UTF8(self) = data;
15622 }
15623 if (share_wstr) {
15624 _PyUnicode_WSTR_LENGTH(self) = length;
15625 _PyUnicode_WSTR(self) = (wchar_t *)data;
15626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015627
Christian Heimesf051e432016-09-13 20:22:02 +020015628 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015629 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015630 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015631#ifdef Py_DEBUG
15632 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15633#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015634 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015635
15636onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015637 Py_DECREF(self);
15638 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015639}
15640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015641PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015642"str(object='') -> str\n\
15643str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015644\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015645Create a new string object from the given object. If encoding or\n\
15646errors is specified, then the object must expose a data buffer\n\
15647that will be decoded using the given encoding and error handler.\n\
15648Otherwise, returns the result of object.__str__() (if defined)\n\
15649or repr(object).\n\
15650encoding defaults to sys.getdefaultencoding().\n\
15651errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015652
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015653static PyObject *unicode_iter(PyObject *seq);
15654
Guido van Rossumd57fd912000-03-10 22:53:23 +000015655PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015656 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015657 "str", /* tp_name */
15658 sizeof(PyUnicodeObject), /* tp_basicsize */
15659 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015660 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015661 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015662 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015663 0, /* tp_getattr */
15664 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015665 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015666 unicode_repr, /* tp_repr */
15667 &unicode_as_number, /* tp_as_number */
15668 &unicode_as_sequence, /* tp_as_sequence */
15669 &unicode_as_mapping, /* tp_as_mapping */
15670 (hashfunc) unicode_hash, /* tp_hash*/
15671 0, /* tp_call*/
15672 (reprfunc) unicode_str, /* tp_str */
15673 PyObject_GenericGetAttr, /* tp_getattro */
15674 0, /* tp_setattro */
15675 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015676 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015677 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15678 unicode_doc, /* tp_doc */
15679 0, /* tp_traverse */
15680 0, /* tp_clear */
15681 PyUnicode_RichCompare, /* tp_richcompare */
15682 0, /* tp_weaklistoffset */
15683 unicode_iter, /* tp_iter */
15684 0, /* tp_iternext */
15685 unicode_methods, /* tp_methods */
15686 0, /* tp_members */
15687 0, /* tp_getset */
15688 &PyBaseObject_Type, /* tp_base */
15689 0, /* tp_dict */
15690 0, /* tp_descr_get */
15691 0, /* tp_descr_set */
15692 0, /* tp_dictoffset */
15693 0, /* tp_init */
15694 0, /* tp_alloc */
15695 unicode_new, /* tp_new */
15696 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015697};
15698
15699/* Initialize the Unicode implementation */
15700
Victor Stinner331a6a52019-05-27 16:39:22 +020015701PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015702_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015703{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015704 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015705 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015706 0x000A, /* LINE FEED */
15707 0x000D, /* CARRIAGE RETURN */
15708 0x001C, /* FILE SEPARATOR */
15709 0x001D, /* GROUP SEPARATOR */
15710 0x001E, /* RECORD SEPARATOR */
15711 0x0085, /* NEXT LINE */
15712 0x2028, /* LINE SEPARATOR */
15713 0x2029, /* PARAGRAPH SEPARATOR */
15714 };
15715
Victor Stinner91698d82020-06-25 14:07:40 +020015716 struct _Py_unicode_state *state = &tstate->interp->unicode;
15717 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015718 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015719 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015720
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015721 if (_Py_IsMainInterpreter(tstate)) {
15722 /* initialize the linebreak bloom filter */
15723 bloom_linebreak = make_bloom_mask(
15724 PyUnicode_2BYTE_KIND, linebreak,
15725 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015726
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015727 if (PyType_Ready(&PyUnicode_Type) < 0) {
15728 return _PyStatus_ERR("Can't initialize unicode type");
15729 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015730
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015731 if (PyType_Ready(&EncodingMapType) < 0) {
15732 return _PyStatus_ERR("Can't initialize encoding map type");
15733 }
15734 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15735 return _PyStatus_ERR("Can't initialize field name iterator type");
15736 }
15737 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15738 return _PyStatus_ERR("Can't initialize formatter iter type");
15739 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015740 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015741 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015742}
15743
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015744
Walter Dörwald16807132007-05-25 13:52:07 +000015745void
15746PyUnicode_InternInPlace(PyObject **p)
15747{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015748 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015749#ifdef Py_DEBUG
15750 assert(s != NULL);
15751 assert(_PyUnicode_CHECK(s));
15752#else
Victor Stinner607b1022020-05-05 18:50:30 +020015753 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015754 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015755 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015756#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015757
Benjamin Peterson14339b62009-01-31 16:36:08 +000015758 /* If it's a subclass, we don't really know what putting
15759 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015760 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015761 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015762 }
15763
15764 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015765 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015766 }
15767
15768#ifdef INTERNED_STRINGS
Victor Stinner666ecfb2020-07-02 01:19:57 +020015769 if (PyUnicode_READY(s) == -1) {
15770 PyErr_Clear();
15771 return;
15772 }
15773
Benjamin Peterson14339b62009-01-31 16:36:08 +000015774 if (interned == NULL) {
15775 interned = PyDict_New();
15776 if (interned == NULL) {
15777 PyErr_Clear(); /* Don't leave an exception */
15778 return;
15779 }
15780 }
Victor Stinner607b1022020-05-05 18:50:30 +020015781
15782 PyObject *t;
Berker Peksagced8d4c2016-07-25 04:40:39 +030015783 t = PyDict_SetDefault(interned, s, s);
Victor Stinner607b1022020-05-05 18:50:30 +020015784
Berker Peksagced8d4c2016-07-25 04:40:39 +030015785 if (t == NULL) {
15786 PyErr_Clear();
15787 return;
15788 }
Victor Stinner607b1022020-05-05 18:50:30 +020015789
Berker Peksagced8d4c2016-07-25 04:40:39 +030015790 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015791 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015792 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015793 return;
15794 }
Victor Stinner607b1022020-05-05 18:50:30 +020015795
Victor Stinner3549ca32020-07-03 16:59:12 +020015796 /* The two references in interned dict (key and value) are not counted by
15797 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15798 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015799 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015800 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner7f413a52020-09-23 14:05:32 +020015801#else
15802 // PyDict expects that interned strings have their hash
15803 // (PyASCIIObject.hash) already computed.
15804 (void)unicode_hash(s);
Victor Stinner607b1022020-05-05 18:50:30 +020015805#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015806}
15807
15808void
15809PyUnicode_InternImmortal(PyObject **p)
15810{
Victor Stinner583ee5a2020-10-02 14:49:00 +020015811 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15812 "PyUnicode_InternImmortal() is deprecated; "
15813 "use PyUnicode_InternInPlace() instead", 1) < 0)
15814 {
15815 // The function has no return value, the exception cannot
15816 // be reported to the caller, so just log it.
15817 PyErr_WriteUnraisable(NULL);
15818 }
15819
Benjamin Peterson14339b62009-01-31 16:36:08 +000015820 PyUnicode_InternInPlace(p);
15821 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015822 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015823 Py_INCREF(*p);
15824 }
Walter Dörwald16807132007-05-25 13:52:07 +000015825}
15826
15827PyObject *
15828PyUnicode_InternFromString(const char *cp)
15829{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015830 PyObject *s = PyUnicode_FromString(cp);
15831 if (s == NULL)
15832 return NULL;
15833 PyUnicode_InternInPlace(&s);
15834 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015835}
15836
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015837
Victor Stinner666ecfb2020-07-02 01:19:57 +020015838void
15839_PyUnicode_ClearInterned(PyThreadState *tstate)
Walter Dörwald16807132007-05-25 13:52:07 +000015840{
Victor Stinner666ecfb2020-07-02 01:19:57 +020015841 if (!_Py_IsMainInterpreter(tstate)) {
15842 // interned dict is shared by all interpreters
Benjamin Peterson14339b62009-01-31 16:36:08 +000015843 return;
15844 }
Walter Dörwald16807132007-05-25 13:52:07 +000015845
Victor Stinner666ecfb2020-07-02 01:19:57 +020015846 if (interned == NULL) {
15847 return;
15848 }
15849 assert(PyDict_CheckExact(interned));
15850
15851 PyObject *keys = PyDict_Keys(interned);
15852 if (keys == NULL) {
15853 PyErr_Clear();
15854 return;
15855 }
15856 assert(PyList_CheckExact(keys));
15857
15858 /* Interned unicode strings are not forcibly deallocated; rather, we give
15859 them their stolen references back, and then clear and DECREF the
15860 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015861
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015862 Py_ssize_t n = PyList_GET_SIZE(keys);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015863#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015864 fprintf(stderr, "releasing %zd interned strings\n", n);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015865
15866 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015867#endif
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015868 for (Py_ssize_t i = 0; i < n; i++) {
15869 PyObject *s = PyList_GET_ITEM(keys, i);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015870 assert(PyUnicode_IS_READY(s));
15871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015872 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015873 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015874 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015875#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015876 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015877#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015878 break;
15879 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015880 // Restore the two references (key and value) ignored
15881 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015882 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015883#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015884 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015885#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015886 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015887 case SSTATE_NOT_INTERNED:
15888 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015889 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015890 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015892 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015893 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015894#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015895 fprintf(stderr,
15896 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15897 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015898#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015899 Py_DECREF(keys);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015900
Benjamin Peterson14339b62009-01-31 16:36:08 +000015901 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015902 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015903}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015904
15905
15906/********************* Unicode Iterator **************************/
15907
/* State of a str iterator (type PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to yield */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15913
15914static void
15915unicodeiter_dealloc(unicodeiterobject *it)
15916{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015917 _PyObject_GC_UNTRACK(it);
15918 Py_XDECREF(it->it_seq);
15919 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015920}
15921
/* tp_traverse of str_iterator: report the iterated string to the garbage
   collector.  Returns 0, or the non-zero value returned by `visit`. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15928
15929static PyObject *
15930unicodeiter_next(unicodeiterobject *it)
15931{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015932 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015933
Benjamin Peterson14339b62009-01-31 16:36:08 +000015934 assert(it != NULL);
15935 seq = it->it_seq;
15936 if (seq == NULL)
15937 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015938 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015940 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15941 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015942 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015943 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15944 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015945 if (item != NULL)
15946 ++it->it_index;
15947 return item;
15948 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015949
Benjamin Peterson14339b62009-01-31 16:36:08 +000015950 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015951 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015952 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015953}
15954
15955static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015956unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015957{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015958 Py_ssize_t len = 0;
15959 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015960 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015961 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015962}
15963
/* Docstring of str_iterator.__length_hint__(). */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15965
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015966static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015967unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015968{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015969 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015970 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015971 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015972 it->it_seq, it->it_index);
15973 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015974 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015975 if (u == NULL)
15976 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015977 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015978 }
15979}
15980
/* Docstring of str_iterator.__reduce__(). */
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15982
15983static PyObject *
15984unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15985{
15986 Py_ssize_t index = PyLong_AsSsize_t(state);
15987 if (index == -1 && PyErr_Occurred())
15988 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015989 if (it->it_seq != NULL) {
15990 if (index < 0)
15991 index = 0;
15992 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15993 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15994 it->it_index = index;
15995 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015996 Py_RETURN_NONE;
15997}
15998
/* Docstring of str_iterator.__setstate__(). */
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16000
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016001static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000016002 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000016003 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016004 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
16005 reduce_doc},
16006 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
16007 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000016008 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016009};
16010
16011PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000016012 PyVarObject_HEAD_INIT(&PyType_Type, 0)
16013 "str_iterator", /* tp_name */
16014 sizeof(unicodeiterobject), /* tp_basicsize */
16015 0, /* tp_itemsize */
16016 /* methods */
16017 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020016018 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000016019 0, /* tp_getattr */
16020 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020016021 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000016022 0, /* tp_repr */
16023 0, /* tp_as_number */
16024 0, /* tp_as_sequence */
16025 0, /* tp_as_mapping */
16026 0, /* tp_hash */
16027 0, /* tp_call */
16028 0, /* tp_str */
16029 PyObject_GenericGetAttr, /* tp_getattro */
16030 0, /* tp_setattro */
16031 0, /* tp_as_buffer */
16032 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16033 0, /* tp_doc */
16034 (traverseproc)unicodeiter_traverse, /* tp_traverse */
16035 0, /* tp_clear */
16036 0, /* tp_richcompare */
16037 0, /* tp_weaklistoffset */
16038 PyObject_SelfIter, /* tp_iter */
16039 (iternextfunc)unicodeiter_next, /* tp_iternext */
16040 unicodeiter_methods, /* tp_methods */
16041 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016042};
16043
16044static PyObject *
16045unicode_iter(PyObject *seq)
16046{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016047 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016048
Benjamin Peterson14339b62009-01-31 16:36:08 +000016049 if (!PyUnicode_Check(seq)) {
16050 PyErr_BadInternalCall();
16051 return NULL;
16052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020016053 if (PyUnicode_READY(seq) == -1)
16054 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016055 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16056 if (it == NULL)
16057 return NULL;
16058 it->it_index = 0;
16059 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016060 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016061 _PyObject_GC_TRACK(it);
16062 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016063}
16064
Victor Stinner709d23d2019-05-02 14:56:30 -040016065static int
16066encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016067{
Victor Stinner709d23d2019-05-02 14:56:30 -040016068 int res;
16069 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16070 if (res == -2) {
16071 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16072 return -1;
16073 }
16074 if (res < 0) {
16075 PyErr_NoMemory();
16076 return -1;
16077 }
16078 return 0;
16079}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016080
Victor Stinner709d23d2019-05-02 14:56:30 -040016081
16082static int
16083config_get_codec_name(wchar_t **config_encoding)
16084{
16085 char *encoding;
16086 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16087 return -1;
16088 }
16089
16090 PyObject *name_obj = NULL;
16091 PyObject *codec = _PyCodec_Lookup(encoding);
16092 PyMem_RawFree(encoding);
16093
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016094 if (!codec)
16095 goto error;
16096
16097 name_obj = PyObject_GetAttrString(codec, "name");
16098 Py_CLEAR(codec);
16099 if (!name_obj) {
16100 goto error;
16101 }
16102
Victor Stinner709d23d2019-05-02 14:56:30 -040016103 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16104 Py_DECREF(name_obj);
16105 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016106 goto error;
16107 }
16108
Victor Stinner709d23d2019-05-02 14:56:30 -040016109 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16110 if (raw_wname == NULL) {
16111 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016112 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016113 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016114 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016115
16116 PyMem_RawFree(*config_encoding);
16117 *config_encoding = raw_wname;
16118
16119 PyMem_Free(wname);
16120 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016121
16122error:
16123 Py_XDECREF(codec);
16124 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016125 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016126}
16127
16128
Victor Stinner331a6a52019-05-27 16:39:22 +020016129static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016130init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016131{
Victor Stinner709d23d2019-05-02 14:56:30 -040016132 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016133 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016134 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016135 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016136 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016137 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016138 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016139}
16140
16141
Victor Stinner709d23d2019-05-02 14:56:30 -040016142static int
16143init_fs_codec(PyInterpreterState *interp)
16144{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016145 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016146
16147 _Py_error_handler error_handler;
16148 error_handler = get_error_handler_wide(config->filesystem_errors);
16149 if (error_handler == _Py_ERROR_UNKNOWN) {
16150 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16151 return -1;
16152 }
16153
16154 char *encoding, *errors;
16155 if (encode_wstr_utf8(config->filesystem_encoding,
16156 &encoding,
16157 "filesystem_encoding") < 0) {
16158 return -1;
16159 }
16160
16161 if (encode_wstr_utf8(config->filesystem_errors,
16162 &errors,
16163 "filesystem_errors") < 0) {
16164 PyMem_RawFree(encoding);
16165 return -1;
16166 }
16167
Victor Stinner3d17c042020-05-14 01:48:38 +020016168 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16169 PyMem_RawFree(fs_codec->encoding);
16170 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016171 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016172 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16173 PyMem_RawFree(fs_codec->errors);
16174 fs_codec->errors = errors;
16175 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016176
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016177#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016178 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016179#endif
16180
Victor Stinner709d23d2019-05-02 14:56:30 -040016181 /* At this point, PyUnicode_EncodeFSDefault() and
16182 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16183 the C implementation of the filesystem encoding. */
16184
16185 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16186 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016187 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16188 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016189 PyErr_NoMemory();
16190 return -1;
16191 }
16192 return 0;
16193}
16194
16195
Victor Stinner331a6a52019-05-27 16:39:22 +020016196static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016197init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016198{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016199 PyInterpreterState *interp = tstate->interp;
16200
Victor Stinner709d23d2019-05-02 14:56:30 -040016201 /* Update the filesystem encoding to the normalized Python codec name.
16202 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16203 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016204 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016205 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016206 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016207 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016208 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016209 }
16210
Victor Stinner709d23d2019-05-02 14:56:30 -040016211 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016212 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016213 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016214 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016215}
16216
16217
Victor Stinner331a6a52019-05-27 16:39:22 +020016218PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016219_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016220{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016221 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016222 if (_PyStatus_EXCEPTION(status)) {
16223 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016224 }
16225
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016226 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016227}
16228
16229
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016230static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016231_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016232{
Victor Stinner3d17c042020-05-14 01:48:38 +020016233 PyMem_RawFree(fs_codec->encoding);
16234 fs_codec->encoding = NULL;
16235 fs_codec->utf8 = 0;
16236 PyMem_RawFree(fs_codec->errors);
16237 fs_codec->errors = NULL;
16238 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016239}
16240
16241
#ifdef MS_WINDOWS
/* Switch the current interpreter to the mbcs/replace filesystem encoding
   and re-initialize its filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated (the configuration is then
   left unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        /* PyMem_RawFree(NULL) is a no-op, so both can be freed blindly. */
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16267
16268
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016269void
Victor Stinner3d483342019-11-22 12:27:50 +010016270_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016271{
Victor Stinner666ecfb2020-07-02 01:19:57 +020016272 // _PyUnicode_ClearInterned() must be called before
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016273
Victor Stinner666ecfb2020-07-02 01:19:57 +020016274 struct _Py_unicode_state *state = &tstate->interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016275
Victor Stinner91698d82020-06-25 14:07:40 +020016276 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016277
Victor Stinner2f9ada92020-06-24 02:22:21 +020016278 for (Py_ssize_t i = 0; i < 256; i++) {
16279 Py_CLEAR(state->latin1[i]);
16280 }
16281
Victor Stinnerba3d67c2020-12-26 00:41:46 +010016282 unicode_clear_identifiers(tstate);
Victor Stinner709d23d2019-05-02 14:56:30 -040016283
Victor Stinner3d17c042020-05-14 01:48:38 +020016284 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016285}
16286
16287
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
16290
16291static PyMethodDef _string_methods[] = {
16292 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16293 METH_O, PyDoc_STR("split the argument as a field name")},
16294 {"formatter_parser", (PyCFunction) formatter_parser,
16295 METH_O, PyDoc_STR("parse the argument as a format string")},
16296 {NULL, NULL}
16297};
16298
16299static struct PyModuleDef _string_module = {
16300 PyModuleDef_HEAD_INIT,
Victor Stinnerbb083d32020-09-08 15:33:08 +020016301 .m_name = "_string",
16302 .m_doc = PyDoc_STR("string helper module"),
16303 .m_size = 0,
16304 .m_methods = _string_methods,
Georg Brandl66c221e2010-10-14 07:04:07 +000016305};
16306
16307PyMODINIT_FUNC
16308PyInit__string(void)
16309{
Victor Stinnerbb083d32020-09-08 15:33:08 +020016310 return PyModuleDef_Init(&_string_module);
Georg Brandl66c221e2010-10-14 07:04:07 +000016311}
16312
16313
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016314#ifdef __cplusplus
16315}
16316#endif