blob: aba7407533c4ed28977a7f6d7591fdbf514710e0 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020047#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinnerbcda8f12018-11-21 22:27:47 +010048#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020049#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040050#include "pycore_pylifecycle.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020051#include "pycore_pystate.h" // _PyInterpreterState_GET()
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000052#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070053#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
61/* #define INTERNED_STATS 1 */
62
63
Larry Hastings61272b72014-01-07 12:41:53 -080064/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090065class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080066[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090067/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
69/*[python input]
70class Py_UCS4_converter(CConverter):
71 type = 'Py_UCS4'
72 converter = 'convert_uc'
73
74 def converter_init(self):
75 if self.default is not unspecified:
76 self.c_default = ascii(self.default)
77 if len(self.c_default) > 4 or self.c_default[0] != "'":
78 self.c_default = hex(ord(self.default))
79
80[python start generated code]*/
81/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchaka05997252013-01-26 12:14:02 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros below: in debug builds it runs
   the consistency checker without the content scan (check_content == 0),
   otherwise it is a plain type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Field accessors.

   The macros that are a bare cast plus member access (_PyUnicode_UTF8,
   _PyUnicode_UTF8_LENGTH, _PyUnicode_WSTR, _PyUnicode_WSTR_LENGTH,
   _PyUnicode_LENGTH, _PyUnicode_STATE, _PyUnicode_HASH,
   _PyUnicode_DATA_ANY) perform no checks and expand to an lvalue.
   The other macros assert _PyUnicode_CHECK(op) before reading the field. */

/* utf8 pointer of a non-compact-ASCII object (unchecked). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 data of a ready string: for a compact ASCII object this is the
   address right after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length field (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for a compact ASCII object this is the
   `length` field, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY (any earlier definition is
   removed by the #undef): evaluates to 0 when the object is already ready,
   otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200146
/* True if the utf8 pointer of `op` is the same address as its character
   data, i.e. the two representations share one buffer.  Must not be used on
   compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data.  The macro argument is used for both sides of the comparison; it
   must not depend on a caller variable that happens to be named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
154
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either the
   object is not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
/* Element-wise copy with a cast between two character types.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters ("from_type *"), and to points to the destination
   buffer ("to_type *").  The bulk of the input is copied four elements per
   iteration; the remaining 0..3 elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                            \
        to_type *_to = (to_type *)(to);                             \
        const from_type *_iter = (const from_type *)(begin);        \
        const from_type *_end = (const from_type *)(end);           \
        Py_ssize_t n = _end - _iter;                                \
        const from_type *_unrolled_end =                            \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                      \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {       \
            _to[0] = (to_type) _iter[0];                            \
            _to[1] = (to_type) _iter[1];                            \
            _to[2] = (to_type) _iter[2];                            \
            _to[3] = (to_type) _iter[3];                            \
        }                                                           \
        for (; _iter < _end; _iter++, _to++) {                      \
            *_to = (to_type) *_iter;                                \
        }                                                           \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200192
/* Platform-specific over-allocation factor.
   NOTE(review): the percentages below suggest the extra space is
   size / OVERALLOCATE_FACTOR (1/2 = 50%, 1/4 = 25%) -- confirm at the
   place where the macro is used. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
200
Walter Dörwald16807132007-05-25 13:52:07 +0000201/* This dictionary holds all interned unicode strings. Note that references
202 to strings in this dictionary are *not* counted in the string's ob_refcnt.
203 When the interned string reaches a refcnt of 0 the string deallocation
204 function will delete the reference from this dictionary.
205
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000208*/
/* Dictionary of interned strings; NULL at start-up. */
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance.
   It starts out NULL and is created on first use by
   _Py_INCREF_UNICODE_EMPTY(). */
static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) the first time it is needed.  If that creation
   fails, unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (returns NULL if the singleton could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000232
Victor Stinner59423e32018-11-26 13:40:01 +0100233static inline void
234unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
235 Py_ssize_t start, Py_ssize_t length)
236{
237 assert(0 <= start);
238 assert(kind != PyUnicode_WCHAR_KIND);
239 switch (kind) {
240 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100241 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100242 Py_UCS1 ch = (unsigned char)value;
243 Py_UCS1 *to = (Py_UCS1 *)data + start;
244 memset(to, ch, length);
245 break;
246 }
247 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100248 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100249 Py_UCS2 ch = (Py_UCS2)value;
250 Py_UCS2 *to = (Py_UCS2 *)data + start;
251 const Py_UCS2 *end = to + length;
252 for (; to < end; ++to) *to = ch;
253 break;
254 }
255 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100256 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100257 Py_UCS4 ch = value;
258 Py_UCS4 * to = (Py_UCS4 *)data + start;
259 const Py_UCS4 *end = to + length;
260 for (; to < end; ++to) *to = ch;
261 break;
262 }
263 default: Py_UNREACHABLE();
264 }
265}
266
267
/* Forward declarations of file-local helpers.  Their definitions are
   expected further down in this file. */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
static PyObject *
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
                    const char *errors);
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed);

/* List of static strings. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by code point (0..255); an entry
   is NULL until a string for that character has been cached (see
   unicode_result_ready()). */
static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000287
/* Fast detection of the most frequent whitespace characters.
   The table has one entry per ASCII code point (0..127), eight entries per
   row; an entry is 1 if the character is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40..0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
318
/* Forward declarations of file-local helpers.  Their definitions are
   expected further down in this file. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors from a buffer of `size` UCS1 / UCS2 / UCS4 characters. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error handling helpers. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
/* Fast detection of the most frequent line break characters.
   The table has one entry per ASCII code point (0..127), eight entries per
   row; an entry is 1 if the character is a line break and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40..0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

/* Converter referenced by the Py_UCS4_converter clinic class
   (converter = 'convert_uc'); declared here so the generated clinic code
   included below can use it. */
static int convert_uc(PyObject *obj, void *addr);
374
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300375#include "clinic/unicodeobject.c.h"
376
Victor Stinner3d4226a2018-08-29 22:21:32 +0200377_Py_error_handler
378_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200379{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200381 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200382 }
383 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200384 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200385 }
386 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200387 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200388 }
389 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200390 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200391 }
392 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200393 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 }
395 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200396 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200397 }
398 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200400 }
Victor Stinner50149202015-09-22 00:26:54 +0200401 return _Py_ERROR_OTHER;
402}
403
Victor Stinner709d23d2019-05-02 14:56:30 -0400404
405static _Py_error_handler
406get_error_handler_wide(const wchar_t *errors)
407{
408 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
409 return _Py_ERROR_STRICT;
410 }
411 if (wcscmp(errors, L"surrogateescape") == 0) {
412 return _Py_ERROR_SURROGATEESCAPE;
413 }
414 if (wcscmp(errors, L"replace") == 0) {
415 return _Py_ERROR_REPLACE;
416 }
417 if (wcscmp(errors, L"ignore") == 0) {
418 return _Py_ERROR_IGNORE;
419 }
420 if (wcscmp(errors, L"backslashreplace") == 0) {
421 return _Py_ERROR_BACKSLASHREPLACE;
422 }
423 if (wcscmp(errors, L"surrogatepass") == 0) {
424 return _Py_ERROR_SURROGATEPASS;
425 }
426 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
427 return _Py_ERROR_XMLCHARREFREPLACE;
428 }
429 return _Py_ERROR_OTHER;
430}
431
432
Victor Stinner22eb6892019-06-26 00:51:05 +0200433static inline int
434unicode_check_encoding_errors(const char *encoding, const char *errors)
435{
436 if (encoding == NULL && errors == NULL) {
437 return 0;
438 }
439
Victor Stinner81a7be32020-04-14 15:14:01 +0200440 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200441#ifndef Py_DEBUG
442 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200443 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200444 return 0;
445 }
446#else
447 /* Always check in debug mode */
448#endif
449
450 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
451 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
452 if (!interp->fs_codec.encoding) {
453 return 0;
454 }
455
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200456 /* Disable checks during Python finalization. For example, it allows to
457 call _PyObject_Dump() during finalization for debugging purpose. */
458 if (interp->finalizing) {
459 return 0;
460 }
461
Victor Stinner22eb6892019-06-26 00:51:05 +0200462 if (encoding != NULL) {
463 PyObject *handler = _PyCodec_Lookup(encoding);
464 if (handler == NULL) {
465 return -1;
466 }
467 Py_DECREF(handler);
468 }
469
470 if (errors != NULL) {
471 PyObject *handler = PyCodec_LookupError(errors);
472 if (handler == NULL) {
473 return -1;
474 }
475 Py_DECREF(handler);
476 }
477 return 0;
478}
479
480
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300481/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
482 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000483Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000484PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000485{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000486#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000487 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000488#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000489 /* This is actually an illegal character, so it should
490 not be passed to unichr. */
491 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000492#endif
493}
494
/* Verify the internal invariants of a Unicode object.

   op:            the object to check; must not be NULL (asserted).
   check_content: when non-zero, and the object is not in the legacy wchar
                  kind, additionally scan every character (O(n)) to verify
                  that the narrowest possible kind is used and that the
                  buffer ends with a null character.

   Every violated invariant is reported through
   _PyObject_ASSERT_FAILED_MSG() with the failing expression as the
   message.  Returns 1 when the end of the function is reached. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    PyASCIIObject *ascii;
    unsigned int kind;

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Case 1: compact ASCII object */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        /* Case 2: compact, non-ASCII object; the character data starts
           right after the PyCompactUnicodeObject header */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                  || kind == PyUnicode_2BYTE_KIND
                  || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            /* Case 3: non-compact object; the character data is reached
               through the data.any pointer */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer may be set */
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            else {
                CHECK(kind == PyUnicode_1BYTE_KIND
                      || kind == PyUnicode_2BYTE_KIND
                      || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the character data exactly when the kind
               has the same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* an absent buffer must have a zero length recorded */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        const void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest code point stored in the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* the element just past the last character must be zero */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200614
Victor Stinner910337b2011-10-03 03:20:16 +0200615
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616static PyObject*
617unicode_result_wchar(PyObject *unicode)
618{
619#ifndef Py_DEBUG
620 Py_ssize_t len;
621
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100622 len = _PyUnicode_WSTR_LENGTH(unicode);
623 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100624 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200625 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100626 }
627
628 if (len == 1) {
629 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100630 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
632 Py_DECREF(unicode);
633 return latin1_char;
634 }
635 }
636
637 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200638 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100639 return NULL;
640 }
641#else
Victor Stinneraa771272012-10-04 02:32:58 +0200642 assert(Py_REFCNT(unicode) == 1);
643
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100644 /* don't make the result ready in debug mode to ensure that the caller
645 makes the string ready before using it */
646 assert(_PyUnicode_CheckConsistency(unicode, 1));
647#endif
648 return unicode;
649}
650
651static PyObject*
652unicode_result_ready(PyObject *unicode)
653{
654 Py_ssize_t length;
655
656 length = PyUnicode_GET_LENGTH(unicode);
657 if (length == 0) {
658 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100659 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200660 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100661 }
662 return unicode_empty;
663 }
664
665 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300666 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200667 int kind = PyUnicode_KIND(unicode);
668 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100669 if (ch < 256) {
670 PyObject *latin1_char = unicode_latin1[ch];
671 if (latin1_char != NULL) {
672 if (unicode != latin1_char) {
673 Py_INCREF(latin1_char);
674 Py_DECREF(unicode);
675 }
676 return latin1_char;
677 }
678 else {
679 assert(_PyUnicode_CheckConsistency(unicode, 1));
680 Py_INCREF(unicode);
681 unicode_latin1[ch] = unicode;
682 return unicode;
683 }
684 }
685 }
686
687 assert(_PyUnicode_CheckConsistency(unicode, 1));
688 return unicode;
689}
690
691static PyObject*
692unicode_result(PyObject *unicode)
693{
694 assert(_PyUnicode_CHECK(unicode));
695 if (PyUnicode_IS_READY(unicode))
696 return unicode_result_ready(unicode);
697 else
698 return unicode_result_wchar(unicode);
699}
700
Victor Stinnerc4b49542011-12-11 22:44:26 +0100701static PyObject*
702unicode_result_unchanged(PyObject *unicode)
703{
704 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500705 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100706 return NULL;
707 Py_INCREF(unicode);
708 return unicode;
709 }
710 else
711 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100712 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100713}
714
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200715/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
716 ASCII, Latin1, UTF-8, etc. */
717static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200718backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200719 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
720{
Victor Stinnerad771582015-10-09 12:38:53 +0200721 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200722 Py_UCS4 ch;
723 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300724 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200725
726 assert(PyUnicode_IS_READY(unicode));
727 kind = PyUnicode_KIND(unicode);
728 data = PyUnicode_DATA(unicode);
729
730 size = 0;
731 /* determine replacement size */
732 for (i = collstart; i < collend; ++i) {
733 Py_ssize_t incr;
734
735 ch = PyUnicode_READ(kind, data, i);
736 if (ch < 0x100)
737 incr = 2+2;
738 else if (ch < 0x10000)
739 incr = 2+4;
740 else {
741 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200742 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200743 }
744 if (size > PY_SSIZE_T_MAX - incr) {
745 PyErr_SetString(PyExc_OverflowError,
746 "encoded result is too long for a Python string");
747 return NULL;
748 }
749 size += incr;
750 }
751
Victor Stinnerad771582015-10-09 12:38:53 +0200752 str = _PyBytesWriter_Prepare(writer, str, size);
753 if (str == NULL)
754 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200755
756 /* generate replacement */
757 for (i = collstart; i < collend; ++i) {
758 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200759 *str++ = '\\';
760 if (ch >= 0x00010000) {
761 *str++ = 'U';
762 *str++ = Py_hexdigits[(ch>>28)&0xf];
763 *str++ = Py_hexdigits[(ch>>24)&0xf];
764 *str++ = Py_hexdigits[(ch>>20)&0xf];
765 *str++ = Py_hexdigits[(ch>>16)&0xf];
766 *str++ = Py_hexdigits[(ch>>12)&0xf];
767 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200768 }
Victor Stinner797485e2015-10-09 03:17:30 +0200769 else if (ch >= 0x100) {
770 *str++ = 'u';
771 *str++ = Py_hexdigits[(ch>>12)&0xf];
772 *str++ = Py_hexdigits[(ch>>8)&0xf];
773 }
774 else
775 *str++ = 'x';
776 *str++ = Py_hexdigits[(ch>>4)&0xf];
777 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 }
779 return str;
780}
781
782/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
783 ASCII, Latin1, UTF-8, etc. */
784static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200785xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200786 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
787{
Victor Stinnerad771582015-10-09 12:38:53 +0200788 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200789 Py_UCS4 ch;
790 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300791 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200792
793 assert(PyUnicode_IS_READY(unicode));
794 kind = PyUnicode_KIND(unicode);
795 data = PyUnicode_DATA(unicode);
796
797 size = 0;
798 /* determine replacement size */
799 for (i = collstart; i < collend; ++i) {
800 Py_ssize_t incr;
801
802 ch = PyUnicode_READ(kind, data, i);
803 if (ch < 10)
804 incr = 2+1+1;
805 else if (ch < 100)
806 incr = 2+2+1;
807 else if (ch < 1000)
808 incr = 2+3+1;
809 else if (ch < 10000)
810 incr = 2+4+1;
811 else if (ch < 100000)
812 incr = 2+5+1;
813 else if (ch < 1000000)
814 incr = 2+6+1;
815 else {
816 assert(ch <= MAX_UNICODE);
817 incr = 2+7+1;
818 }
819 if (size > PY_SSIZE_T_MAX - incr) {
820 PyErr_SetString(PyExc_OverflowError,
821 "encoded result is too long for a Python string");
822 return NULL;
823 }
824 size += incr;
825 }
826
Victor Stinnerad771582015-10-09 12:38:53 +0200827 str = _PyBytesWriter_Prepare(writer, str, size);
828 if (str == NULL)
829 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200830
831 /* generate replacement */
832 for (i = collstart; i < collend; ++i) {
833 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
834 }
835 return str;
836}
837
Thomas Wouters477c8d52006-05-27 19:21:47 +0000838/* --- Bloom Filters ----------------------------------------------------- */
839
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
843
844/* the linebreak mask is set up by Unicode_Init below */
845
/* Number of bits used in a bloom mask: the largest of 128, 64 and 32 that
   does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Mask used by BLOOM_LINEBREAK().  It starts with every bit set, so a BLOOM()
   test against the initial value is never zero. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that any expression may be passed
   (e.g. BLOOM(a | b, ch)). */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: a table lookup in ascii_linebreak[] for ch < 128,
   otherwise the bloom mask is checked first and Py_UNICODE_ISLINEBREAK()
   is only evaluated when the mask test passes. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000865
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700866static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300867make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000868{
Victor Stinnera85af502013-04-09 21:53:54 +0200869#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
870 do { \
871 TYPE *data = (TYPE *)PTR; \
872 TYPE *end = data + LEN; \
873 Py_UCS4 ch; \
874 for (; data != end; data++) { \
875 ch = *data; \
876 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
877 } \
878 break; \
879 } while (0)
880
Thomas Wouters477c8d52006-05-27 19:21:47 +0000881 /* calculate simple bloom-style bitmask for a given unicode string */
882
Antoine Pitrouf068f942010-01-13 14:19:12 +0000883 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884
885 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200886 switch (kind) {
887 case PyUnicode_1BYTE_KIND:
888 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
889 break;
890 case PyUnicode_2BYTE_KIND:
891 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
892 break;
893 case PyUnicode_4BYTE_KIND:
894 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
895 break;
896 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700897 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200898 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000899 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200900
901#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000902}
903
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300904static int
905ensure_unicode(PyObject *obj)
906{
907 if (!PyUnicode_Check(obj)) {
908 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200909 "must be str, not %.100s",
910 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300911 return -1;
912 }
913 return PyUnicode_READY(obj);
914}
915
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200916/* Compilation of templated routines */
917
918#include "stringlib/asciilib.h"
919#include "stringlib/fastsearch.h"
920#include "stringlib/partition.h"
921#include "stringlib/split.h"
922#include "stringlib/count.h"
923#include "stringlib/find.h"
924#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200925#include "stringlib/undef.h"
926
927#include "stringlib/ucs1lib.h"
928#include "stringlib/fastsearch.h"
929#include "stringlib/partition.h"
930#include "stringlib/split.h"
931#include "stringlib/count.h"
932#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300933#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200934#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200935#include "stringlib/undef.h"
936
937#include "stringlib/ucs2lib.h"
938#include "stringlib/fastsearch.h"
939#include "stringlib/partition.h"
940#include "stringlib/split.h"
941#include "stringlib/count.h"
942#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300943#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200944#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200945#include "stringlib/undef.h"
946
947#include "stringlib/ucs4lib.h"
948#include "stringlib/fastsearch.h"
949#include "stringlib/partition.h"
950#include "stringlib/split.h"
951#include "stringlib/count.h"
952#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300953#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200954#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200955#include "stringlib/undef.h"
956
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200957#include "stringlib/unicodedefs.h"
958#include "stringlib/fastsearch.h"
959#include "stringlib/count.h"
960#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100961#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963/* --- Unicode Object ----------------------------------------------------- */
964
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700965static inline Py_ssize_t
966findchar(const void *s, int kind,
967 Py_ssize_t size, Py_UCS4 ch,
968 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 switch (kind) {
971 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200972 if ((Py_UCS1) ch != ch)
973 return -1;
974 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600975 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200976 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600977 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200978 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200979 if ((Py_UCS2) ch != ch)
980 return -1;
981 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600982 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200983 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600984 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200985 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200986 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600987 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600989 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200990 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700991 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200992 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993}
994
#ifdef Py_DEBUG
/* Overwrite the newly added part of a string's data with 0xff bytes so that
   code which forgets to initialize it is detected earlier.

   The characters from index `old_length` up to the current length are
   filled; nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int unit = PyUnicode_KIND(unicode);           /* bytes per char */
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * unit, 0xff,
               (new_length - old_length) * unit);
    }
}
#endif
1013
Victor Stinnerfe226c02011-10-03 03:52:20 +02001014static PyObject*
1015resize_compact(PyObject *unicode, Py_ssize_t length)
1016{
1017 Py_ssize_t char_size;
1018 Py_ssize_t struct_size;
1019 Py_ssize_t new_size;
1020 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001021 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001022#ifdef Py_DEBUG
1023 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1024#endif
1025
Victor Stinner79891572012-05-03 13:43:07 +02001026 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001027 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001028 assert(PyUnicode_IS_COMPACT(unicode));
1029
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001030 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001031 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001032 struct_size = sizeof(PyASCIIObject);
1033 else
1034 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001035 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036
Victor Stinnerfe226c02011-10-03 03:52:20 +02001037 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1038 PyErr_NoMemory();
1039 return NULL;
1040 }
1041 new_size = (struct_size + (length + 1) * char_size);
1042
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001043 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1044 PyObject_DEL(_PyUnicode_UTF8(unicode));
1045 _PyUnicode_UTF8(unicode) = NULL;
1046 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1047 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001048#ifdef Py_REF_DEBUG
1049 _Py_RefTotal--;
1050#endif
1051#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001052 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001053#endif
Victor Stinner84def372011-12-11 20:04:56 +01001054
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001055 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001056 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001057 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001058 PyErr_NoMemory();
1059 return NULL;
1060 }
Victor Stinner84def372011-12-11 20:04:56 +01001061 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001063
Victor Stinnerfe226c02011-10-03 03:52:20 +02001064 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001065 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001066 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001067 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001068 _PyUnicode_WSTR_LENGTH(unicode) = length;
1069 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001070 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1071 PyObject_DEL(_PyUnicode_WSTR(unicode));
1072 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001073 if (!PyUnicode_IS_ASCII(unicode))
1074 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001075 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001076#ifdef Py_DEBUG
1077 unicode_fill_invalid(unicode, old_length);
1078#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001079 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1080 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001081 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 return unicode;
1083}
1084
Alexander Belopolsky40018472011-02-26 01:02:56 +00001085static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001086resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087{
Victor Stinner95663112011-10-04 01:03:50 +02001088 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001089 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001091 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001092
Victor Stinnerfe226c02011-10-03 03:52:20 +02001093 if (PyUnicode_IS_READY(unicode)) {
1094 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001095 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001096 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001097#ifdef Py_DEBUG
1098 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1099#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001100
1101 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001102 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001103 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1104 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001105
1106 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1107 PyErr_NoMemory();
1108 return -1;
1109 }
1110 new_size = (length + 1) * char_size;
1111
Victor Stinner7a9105a2011-12-12 00:13:42 +01001112 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1113 {
1114 PyObject_DEL(_PyUnicode_UTF8(unicode));
1115 _PyUnicode_UTF8(unicode) = NULL;
1116 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1117 }
1118
Victor Stinnerfe226c02011-10-03 03:52:20 +02001119 data = (PyObject *)PyObject_REALLOC(data, new_size);
1120 if (data == NULL) {
1121 PyErr_NoMemory();
1122 return -1;
1123 }
1124 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001125 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001126 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001127 _PyUnicode_WSTR_LENGTH(unicode) = length;
1128 }
1129 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001130 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001131 _PyUnicode_UTF8_LENGTH(unicode) = length;
1132 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001133 _PyUnicode_LENGTH(unicode) = length;
1134 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001135#ifdef Py_DEBUG
1136 unicode_fill_invalid(unicode, old_length);
1137#endif
Victor Stinner95663112011-10-04 01:03:50 +02001138 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001139 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001140 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001141 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 }
Victor Stinner95663112011-10-04 01:03:50 +02001143 assert(_PyUnicode_WSTR(unicode) != NULL);
1144
1145 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001146 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001147 PyErr_NoMemory();
1148 return -1;
1149 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001150 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001151 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001152 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001153 if (!wstr) {
1154 PyErr_NoMemory();
1155 return -1;
1156 }
1157 _PyUnicode_WSTR(unicode) = wstr;
1158 _PyUnicode_WSTR(unicode)[length] = 0;
1159 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001160 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 return 0;
1162}
1163
Victor Stinnerfe226c02011-10-03 03:52:20 +02001164static PyObject*
1165resize_copy(PyObject *unicode, Py_ssize_t length)
1166{
1167 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001169 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001170
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001171 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001172
1173 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1174 if (copy == NULL)
1175 return NULL;
1176
1177 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001178 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001179 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001180 }
1181 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001182 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001183
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001184 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 if (w == NULL)
1186 return NULL;
1187 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1188 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001189 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001190 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001191 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001192 }
1193}
1194
/* We allocate room for one more character (not just one byte) to make
   sure the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198
1199 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001200 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202*/
1203
Alexander Belopolsky40018472011-02-26 01:02:56 +00001204static PyUnicodeObject *
1205_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001207 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
Thomas Wouters477c8d52006-05-27 19:21:47 +00001210 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 if (length == 0 && unicode_empty != NULL) {
1212 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001213 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214 }
1215
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001216 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001217 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001218 return (PyUnicodeObject *)PyErr_NoMemory();
1219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220 if (length < 0) {
1221 PyErr_SetString(PyExc_SystemError,
1222 "Negative size passed to _PyUnicode_New");
1223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 }
1225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1227 if (unicode == NULL)
1228 return NULL;
1229 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001230
1231 _PyUnicode_WSTR_LENGTH(unicode) = length;
1232 _PyUnicode_HASH(unicode) = -1;
1233 _PyUnicode_STATE(unicode).interned = 0;
1234 _PyUnicode_STATE(unicode).kind = 0;
1235 _PyUnicode_STATE(unicode).compact = 0;
1236 _PyUnicode_STATE(unicode).ready = 0;
1237 _PyUnicode_STATE(unicode).ascii = 0;
1238 _PyUnicode_DATA_ANY(unicode) = NULL;
1239 _PyUnicode_LENGTH(unicode) = 0;
1240 _PyUnicode_UTF8(unicode) = NULL;
1241 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1244 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001245 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001246 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001247 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249
Jeremy Hyltond8082792003-09-16 19:41:39 +00001250 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001251 * the caller fails before initializing str -- unicode_resize()
1252 * reads str[0], and the Keep-Alive optimization can keep memory
1253 * allocated for str alive across a call to unicode_dealloc(unicode).
1254 * We don't want unicode_resize to read uninitialized memory in
1255 * that case.
1256 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 _PyUnicode_WSTR(unicode)[0] = 0;
1258 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001259
Victor Stinner7931d9a2011-11-04 00:22:48 +01001260 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261 return unicode;
1262}
1263
Victor Stinnerf42dc442011-10-02 23:33:16 +02001264static const char*
1265unicode_kind_name(PyObject *unicode)
1266{
Victor Stinner42dfd712011-10-03 14:41:45 +02001267 /* don't check consistency: unicode_kind_name() is called from
1268 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001269 if (!PyUnicode_IS_COMPACT(unicode))
1270 {
1271 if (!PyUnicode_IS_READY(unicode))
1272 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001273 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001274 {
1275 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001276 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001277 return "legacy ascii";
1278 else
1279 return "legacy latin1";
1280 case PyUnicode_2BYTE_KIND:
1281 return "legacy UCS2";
1282 case PyUnicode_4BYTE_KIND:
1283 return "legacy UCS4";
1284 default:
1285 return "<legacy invalid kind>";
1286 }
1287 }
1288 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001289 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001290 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001291 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 return "ascii";
1293 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001294 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001295 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001296 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001297 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001298 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001299 default:
1300 return "<invalid compact kind>";
1301 }
1302}
1303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001306const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001307 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001308 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309}
1310
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001311const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001312 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 return _PyUnicode_COMPACT_DATA(unicode);
1314}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001315const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001316 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001317 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1319 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1320 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1321 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1322 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1323 return PyUnicode_DATA(unicode);
1324}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001325
1326void
1327_PyUnicode_Dump(PyObject *op)
1328{
1329 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001330 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1331 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001332 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001333
Victor Stinnera849a4b2011-10-03 12:12:11 +02001334 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001335 {
1336 if (ascii->state.ascii)
1337 data = (ascii + 1);
1338 else
1339 data = (compact + 1);
1340 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001341 else
1342 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001343 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1344 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001345
Victor Stinnera849a4b2011-10-03 12:12:11 +02001346 if (ascii->wstr == data)
1347 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001348 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001349
Victor Stinnera3b334d2011-10-03 13:53:37 +02001350 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001351 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001352 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1353 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001354 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001355 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001356 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001357 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001358}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359#endif
1360
1361PyObject *
1362PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1363{
1364 PyObject *obj;
1365 PyCompactUnicodeObject *unicode;
1366 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001367 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001368 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 Py_ssize_t char_size;
1370 Py_ssize_t struct_size;
1371
1372 /* Optimization for empty strings */
1373 if (size == 0 && unicode_empty != NULL) {
1374 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001375 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 }
1377
Victor Stinner9e9d6892011-10-04 01:02:02 +02001378 is_ascii = 0;
1379 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 struct_size = sizeof(PyCompactUnicodeObject);
1381 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001382 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 char_size = 1;
1384 is_ascii = 1;
1385 struct_size = sizeof(PyASCIIObject);
1386 }
1387 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001388 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 char_size = 1;
1390 }
1391 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001392 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 char_size = 2;
1394 if (sizeof(wchar_t) == 2)
1395 is_sharing = 1;
1396 }
1397 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001398 if (maxchar > MAX_UNICODE) {
1399 PyErr_SetString(PyExc_SystemError,
1400 "invalid maximum character passed to PyUnicode_New");
1401 return NULL;
1402 }
Victor Stinner8f825062012-04-27 13:55:39 +02001403 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 char_size = 4;
1405 if (sizeof(wchar_t) == 4)
1406 is_sharing = 1;
1407 }
1408
1409 /* Ensure we won't overflow the size. */
1410 if (size < 0) {
1411 PyErr_SetString(PyExc_SystemError,
1412 "Negative size passed to PyUnicode_New");
1413 return NULL;
1414 }
1415 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1416 return PyErr_NoMemory();
1417
1418 /* Duplicated allocation code from _PyObject_New() instead of a call to
1419 * PyObject_New() so we are able to allocate space for the object and
1420 * it's data buffer.
1421 */
1422 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1423 if (obj == NULL)
1424 return PyErr_NoMemory();
1425 obj = PyObject_INIT(obj, &PyUnicode_Type);
1426 if (obj == NULL)
1427 return NULL;
1428
1429 unicode = (PyCompactUnicodeObject *)obj;
1430 if (is_ascii)
1431 data = ((PyASCIIObject*)obj) + 1;
1432 else
1433 data = unicode + 1;
1434 _PyUnicode_LENGTH(unicode) = size;
1435 _PyUnicode_HASH(unicode) = -1;
1436 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001437 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 _PyUnicode_STATE(unicode).compact = 1;
1439 _PyUnicode_STATE(unicode).ready = 1;
1440 _PyUnicode_STATE(unicode).ascii = is_ascii;
1441 if (is_ascii) {
1442 ((char*)data)[size] = 0;
1443 _PyUnicode_WSTR(unicode) = NULL;
1444 }
Victor Stinner8f825062012-04-27 13:55:39 +02001445 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 ((char*)data)[size] = 0;
1447 _PyUnicode_WSTR(unicode) = NULL;
1448 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001450 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 else {
1453 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001454 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001455 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001457 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 ((Py_UCS4*)data)[size] = 0;
1459 if (is_sharing) {
1460 _PyUnicode_WSTR_LENGTH(unicode) = size;
1461 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1462 }
1463 else {
1464 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1465 _PyUnicode_WSTR(unicode) = NULL;
1466 }
1467 }
Victor Stinner8f825062012-04-27 13:55:39 +02001468#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001469 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001470#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001471 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 return obj;
1473}
1474
1475#if SIZEOF_WCHAR_T == 2
1476/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1477 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001478 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479
1480 This function assumes that unicode can hold one more code point than wstr
1481 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001482static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001484 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485{
1486 const wchar_t *iter;
1487 Py_UCS4 *ucs4_out;
1488
Victor Stinner910337b2011-10-03 03:20:16 +02001489 assert(unicode != NULL);
1490 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1492 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1493
1494 for (iter = begin; iter < end; ) {
1495 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1496 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001497 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1498 && (iter+1) < end
1499 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001500 {
Victor Stinner551ac952011-11-29 22:58:13 +01001501 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502 iter += 2;
1503 }
1504 else {
1505 *ucs4_out++ = *iter;
1506 iter++;
1507 }
1508 }
1509 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1510 _PyUnicode_GET_LENGTH(unicode)));
1511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512}
1513#endif
1514
Victor Stinnercd9950f2011-10-02 00:34:53 +02001515static int
Victor Stinner488fa492011-12-12 00:01:39 +01001516unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001517{
Victor Stinner488fa492011-12-12 00:01:39 +01001518 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001519 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001520 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001521 return -1;
1522 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001523 return 0;
1524}
1525
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001526static int
1527_copy_characters(PyObject *to, Py_ssize_t to_start,
1528 PyObject *from, Py_ssize_t from_start,
1529 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001530{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001531 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001532 const void *from_data;
1533 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534
Victor Stinneree4544c2012-05-09 22:24:08 +02001535 assert(0 <= how_many);
1536 assert(0 <= from_start);
1537 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001540 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001541
Victor Stinnerd3f08822012-05-29 12:57:52 +02001542 assert(PyUnicode_Check(to));
1543 assert(PyUnicode_IS_READY(to));
1544 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1545
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001546 if (how_many == 0)
1547 return 0;
1548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001550 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001552 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553
Victor Stinnerf1852262012-06-16 16:38:26 +02001554#ifdef Py_DEBUG
1555 if (!check_maxchar
1556 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1557 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001558 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001559 Py_UCS4 ch;
1560 Py_ssize_t i;
1561 for (i=0; i < how_many; i++) {
1562 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1563 assert(ch <= to_maxchar);
1564 }
1565 }
1566#endif
1567
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001568 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001569 if (check_maxchar
1570 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1571 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001572 /* Writing Latin-1 characters into an ASCII string requires to
1573 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001574 Py_UCS4 max_char;
1575 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001576 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001577 if (max_char >= 128)
1578 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001579 }
Christian Heimesf051e432016-09-13 20:22:02 +02001580 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001581 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001582 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 else if (from_kind == PyUnicode_1BYTE_KIND
1585 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001586 {
1587 _PyUnicode_CONVERT_BYTES(
1588 Py_UCS1, Py_UCS2,
1589 PyUnicode_1BYTE_DATA(from) + from_start,
1590 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1591 PyUnicode_2BYTE_DATA(to) + to_start
1592 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001593 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001594 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001595 && to_kind == PyUnicode_4BYTE_KIND)
1596 {
1597 _PyUnicode_CONVERT_BYTES(
1598 Py_UCS1, Py_UCS4,
1599 PyUnicode_1BYTE_DATA(from) + from_start,
1600 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1601 PyUnicode_4BYTE_DATA(to) + to_start
1602 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001603 }
1604 else if (from_kind == PyUnicode_2BYTE_KIND
1605 && to_kind == PyUnicode_4BYTE_KIND)
1606 {
1607 _PyUnicode_CONVERT_BYTES(
1608 Py_UCS2, Py_UCS4,
1609 PyUnicode_2BYTE_DATA(from) + from_start,
1610 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1611 PyUnicode_4BYTE_DATA(to) + to_start
1612 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001613 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001614 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001615 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1616
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001617 if (!check_maxchar) {
1618 if (from_kind == PyUnicode_2BYTE_KIND
1619 && to_kind == PyUnicode_1BYTE_KIND)
1620 {
1621 _PyUnicode_CONVERT_BYTES(
1622 Py_UCS2, Py_UCS1,
1623 PyUnicode_2BYTE_DATA(from) + from_start,
1624 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1625 PyUnicode_1BYTE_DATA(to) + to_start
1626 );
1627 }
1628 else if (from_kind == PyUnicode_4BYTE_KIND
1629 && to_kind == PyUnicode_1BYTE_KIND)
1630 {
1631 _PyUnicode_CONVERT_BYTES(
1632 Py_UCS4, Py_UCS1,
1633 PyUnicode_4BYTE_DATA(from) + from_start,
1634 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1635 PyUnicode_1BYTE_DATA(to) + to_start
1636 );
1637 }
1638 else if (from_kind == PyUnicode_4BYTE_KIND
1639 && to_kind == PyUnicode_2BYTE_KIND)
1640 {
1641 _PyUnicode_CONVERT_BYTES(
1642 Py_UCS4, Py_UCS2,
1643 PyUnicode_4BYTE_DATA(from) + from_start,
1644 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1645 PyUnicode_2BYTE_DATA(to) + to_start
1646 );
1647 }
1648 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001649 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001650 }
1651 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001652 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001653 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001654 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001655 Py_ssize_t i;
1656
Victor Stinnera0702ab2011-09-29 14:14:38 +02001657 for (i=0; i < how_many; i++) {
1658 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001659 if (ch > to_maxchar)
1660 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001661 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1662 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001663 }
1664 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001665 return 0;
1666}
1667
Victor Stinnerd3f08822012-05-29 12:57:52 +02001668void
1669_PyUnicode_FastCopyCharacters(
1670 PyObject *to, Py_ssize_t to_start,
1671 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001672{
1673 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1674}
1675
1676Py_ssize_t
1677PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1678 PyObject *from, Py_ssize_t from_start,
1679 Py_ssize_t how_many)
1680{
1681 int err;
1682
1683 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1684 PyErr_BadInternalCall();
1685 return -1;
1686 }
1687
Benjamin Petersonbac79492012-01-14 13:34:47 -05001688 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001689 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001690 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001691 return -1;
1692
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001693 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001694 PyErr_SetString(PyExc_IndexError, "string index out of range");
1695 return -1;
1696 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001697 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001698 PyErr_SetString(PyExc_IndexError, "string index out of range");
1699 return -1;
1700 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001701 if (how_many < 0) {
1702 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1703 return -1;
1704 }
1705 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001706 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1707 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001708 "Cannot write %zi characters at %zi "
1709 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001710 how_many, to_start, PyUnicode_GET_LENGTH(to));
1711 return -1;
1712 }
1713
1714 if (how_many == 0)
1715 return 0;
1716
Victor Stinner488fa492011-12-12 00:01:39 +01001717 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001718 return -1;
1719
1720 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1721 if (err) {
1722 PyErr_Format(PyExc_SystemError,
1723 "Cannot copy %s characters "
1724 "into a string of %s characters",
1725 unicode_kind_name(from),
1726 unicode_kind_name(to));
1727 return -1;
1728 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001729 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730}
1731
Victor Stinner17222162011-09-28 22:15:37 +02001732/* Find the maximum code point and count the number of surrogate pairs so a
1733 correct string length can be computed before converting a string to UCS4.
1734 This function counts single surrogates as a character and not as a pair.
1735
1736 Return 0 on success, or -1 on error. */
1737static int
1738find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1739 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740{
1741 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001742 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743
Victor Stinnerc53be962011-10-02 21:33:54 +02001744 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 *num_surrogates = 0;
1746 *maxchar = 0;
1747
1748 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001750 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1751 && (iter+1) < end
1752 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1753 {
1754 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1755 ++(*num_surrogates);
1756 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 }
1758 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001760 {
1761 ch = *iter;
1762 iter++;
1763 }
1764 if (ch > *maxchar) {
1765 *maxchar = ch;
1766 if (*maxchar > MAX_UNICODE) {
1767 PyErr_Format(PyExc_ValueError,
1768 "character U+%x is not in range [U+0000; U+10ffff]",
1769 ch);
1770 return -1;
1771 }
1772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 }
1774 return 0;
1775}
1776
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001777int
1778_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779{
1780 wchar_t *end;
1781 Py_UCS4 maxchar = 0;
1782 Py_ssize_t num_surrogates;
1783#if SIZEOF_WCHAR_T == 2
1784 Py_ssize_t length_wo_surrogates;
1785#endif
1786
Georg Brandl7597add2011-10-05 16:36:47 +02001787 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001788 strings were created using _PyObject_New() and where no canonical
1789 representation (the str field) has been set yet aka strings
1790 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001791 assert(_PyUnicode_CHECK(unicode));
1792 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001794 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001795 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001796 /* Actually, it should neither be interned nor be anything else: */
1797 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001800 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001801 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803
1804 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001805 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1806 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 PyErr_NoMemory();
1808 return -1;
1809 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001810 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 _PyUnicode_WSTR(unicode), end,
1812 PyUnicode_1BYTE_DATA(unicode));
1813 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1814 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1815 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1816 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001817 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001818 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001819 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 }
1821 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001822 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001823 _PyUnicode_UTF8(unicode) = NULL;
1824 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 }
1826 PyObject_FREE(_PyUnicode_WSTR(unicode));
1827 _PyUnicode_WSTR(unicode) = NULL;
1828 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1829 }
1830 /* In this case we might have to convert down from 4-byte native
1831 wchar_t to 2-byte unicode. */
1832 else if (maxchar < 65536) {
1833 assert(num_surrogates == 0 &&
1834 "FindMaxCharAndNumSurrogatePairs() messed up");
1835
Victor Stinner506f5922011-09-28 22:34:18 +02001836#if SIZEOF_WCHAR_T == 2
1837 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001838 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001839 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1840 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1841 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001842 _PyUnicode_UTF8(unicode) = NULL;
1843 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001844#else
1845 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001846 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001847 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001848 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001849 PyErr_NoMemory();
1850 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 }
Victor Stinner506f5922011-09-28 22:34:18 +02001852 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1853 _PyUnicode_WSTR(unicode), end,
1854 PyUnicode_2BYTE_DATA(unicode));
1855 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1856 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1857 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001858 _PyUnicode_UTF8(unicode) = NULL;
1859 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001860 PyObject_FREE(_PyUnicode_WSTR(unicode));
1861 _PyUnicode_WSTR(unicode) = NULL;
1862 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1863#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 }
1865 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1866 else {
1867#if SIZEOF_WCHAR_T == 2
1868 /* in case the native representation is 2-bytes, we need to allocate a
1869 new normalized 4-byte version. */
1870 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001871 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1872 PyErr_NoMemory();
1873 return -1;
1874 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001875 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1876 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877 PyErr_NoMemory();
1878 return -1;
1879 }
1880 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1881 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001882 _PyUnicode_UTF8(unicode) = NULL;
1883 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001884 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1885 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001886 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887 PyObject_FREE(_PyUnicode_WSTR(unicode));
1888 _PyUnicode_WSTR(unicode) = NULL;
1889 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1890#else
1891 assert(num_surrogates == 0);
1892
Victor Stinnerc3c74152011-10-02 20:39:55 +02001893 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001895 _PyUnicode_UTF8(unicode) = NULL;
1896 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1898#endif
1899 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1900 }
1901 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001902 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 return 0;
1904}
1905
Alexander Belopolsky40018472011-02-26 01:02:56 +00001906static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001907unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908{
Walter Dörwald16807132007-05-25 13:52:07 +00001909 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001910 case SSTATE_NOT_INTERNED:
1911 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001912
Benjamin Peterson29060642009-01-31 22:14:21 +00001913 case SSTATE_INTERNED_MORTAL:
1914 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001915 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001916 if (PyDict_DelItem(interned, unicode) != 0) {
1917 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1918 NULL);
1919 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001920 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001921
Benjamin Peterson29060642009-01-31 22:14:21 +00001922 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001923 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1924 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001925
Benjamin Peterson29060642009-01-31 22:14:21 +00001926 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001927 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001928 }
1929
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001930 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001931 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001932 }
1933 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001934 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001935 }
1936 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001937 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001940 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941}
1942
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001943#ifdef Py_DEBUG
1944static int
1945unicode_is_singleton(PyObject *unicode)
1946{
1947 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1948 if (unicode == unicode_empty)
1949 return 1;
1950 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1951 {
1952 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1953 if (ch < 256 && unicode_latin1[ch] == unicode)
1954 return 1;
1955 }
1956 return 0;
1957}
1958#endif
1959
Alexander Belopolsky40018472011-02-26 01:02:56 +00001960static int
Victor Stinner488fa492011-12-12 00:01:39 +01001961unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001962{
Victor Stinner488fa492011-12-12 00:01:39 +01001963 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001964 if (Py_REFCNT(unicode) != 1)
1965 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001966 if (_PyUnicode_HASH(unicode) != -1)
1967 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001968 if (PyUnicode_CHECK_INTERNED(unicode))
1969 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001970 if (!PyUnicode_CheckExact(unicode))
1971 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001972#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001973 /* singleton refcount is greater than 1 */
1974 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001975#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001976 return 1;
1977}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001978
Victor Stinnerfe226c02011-10-03 03:52:20 +02001979static int
1980unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1981{
1982 PyObject *unicode;
1983 Py_ssize_t old_length;
1984
1985 assert(p_unicode != NULL);
1986 unicode = *p_unicode;
1987
1988 assert(unicode != NULL);
1989 assert(PyUnicode_Check(unicode));
1990 assert(0 <= length);
1991
Victor Stinner910337b2011-10-03 03:20:16 +02001992 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001993 old_length = PyUnicode_WSTR_LENGTH(unicode);
1994 else
1995 old_length = PyUnicode_GET_LENGTH(unicode);
1996 if (old_length == length)
1997 return 0;
1998
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001999 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002000 _Py_INCREF_UNICODE_EMPTY();
2001 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002002 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002003 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002004 return 0;
2005 }
2006
Victor Stinner488fa492011-12-12 00:01:39 +01002007 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002008 PyObject *copy = resize_copy(unicode, length);
2009 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002010 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002011 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002012 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002013 }
2014
Victor Stinnerfe226c02011-10-03 03:52:20 +02002015 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002016 PyObject *new_unicode = resize_compact(unicode, length);
2017 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002018 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002019 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002020 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002021 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002022 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002023}
2024
Alexander Belopolsky40018472011-02-26 01:02:56 +00002025int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002026PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002027{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002028 PyObject *unicode;
2029 if (p_unicode == NULL) {
2030 PyErr_BadInternalCall();
2031 return -1;
2032 }
2033 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002034 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002035 {
2036 PyErr_BadInternalCall();
2037 return -1;
2038 }
2039 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002040}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002041
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002042/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002043
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002044 WARNING: The function doesn't copy the terminating null character and
2045 doesn't check the maximum character (may write a latin1 character in an
2046 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002047static void
2048unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2049 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002050{
2051 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002052 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002053 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002054
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002055 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002056 switch (kind) {
2057 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002058#ifdef Py_DEBUG
2059 if (PyUnicode_IS_ASCII(unicode)) {
2060 Py_UCS4 maxchar = ucs1lib_find_max_char(
2061 (const Py_UCS1*)str,
2062 (const Py_UCS1*)str + len);
2063 assert(maxchar < 128);
2064 }
2065#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002066 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002067 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002068 }
2069 case PyUnicode_2BYTE_KIND: {
2070 Py_UCS2 *start = (Py_UCS2 *)data + index;
2071 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002072
Victor Stinner184252a2012-06-16 02:57:41 +02002073 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002074 *ucs2 = (Py_UCS2)*str;
2075
2076 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002077 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002078 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002079 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002080 Py_UCS4 *start = (Py_UCS4 *)data + index;
2081 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002082
Victor Stinner184252a2012-06-16 02:57:41 +02002083 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002084 *ucs4 = (Py_UCS4)*str;
2085
2086 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002087 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002088 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002089 default:
2090 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002091 }
2092}
2093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002094static PyObject*
2095get_latin1_char(unsigned char ch)
2096{
Victor Stinnera464fc12011-10-02 20:39:30 +02002097 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002099 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 if (!unicode)
2101 return NULL;
2102 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002103 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002104 unicode_latin1[ch] = unicode;
2105 }
2106 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002107 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002108}
2109
Victor Stinner985a82a2014-01-03 12:53:47 +01002110static PyObject*
2111unicode_char(Py_UCS4 ch)
2112{
2113 PyObject *unicode;
2114
2115 assert(ch <= MAX_UNICODE);
2116
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002117 if (ch < 256)
2118 return get_latin1_char(ch);
2119
Victor Stinner985a82a2014-01-03 12:53:47 +01002120 unicode = PyUnicode_New(1, ch);
2121 if (unicode == NULL)
2122 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002123
2124 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2125 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002126 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002127 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002128 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2129 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2130 }
2131 assert(_PyUnicode_CheckConsistency(unicode, 1));
2132 return unicode;
2133}
2134
Alexander Belopolsky40018472011-02-26 01:02:56 +00002135PyObject *
2136PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002138 if (u == NULL)
2139 return (PyObject*)_PyUnicode_New(size);
2140
2141 if (size < 0) {
2142 PyErr_BadInternalCall();
2143 return NULL;
2144 }
2145
2146 return PyUnicode_FromWideChar(u, size);
2147}
2148
2149PyObject *
2150PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2151{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002152 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 Py_UCS4 maxchar = 0;
2154 Py_ssize_t num_surrogates;
2155
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002156 if (u == NULL && size != 0) {
2157 PyErr_BadInternalCall();
2158 return NULL;
2159 }
2160
2161 if (size == -1) {
2162 size = wcslen(u);
2163 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002165 /* If the Unicode data is known at construction time, we can apply
2166 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002169 if (size == 0)
2170 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 /* Single character Unicode objects in the Latin-1 range are
2173 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002174 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002175 return get_latin1_char((unsigned char)*u);
2176
2177 /* If not empty and not single character, copy the Unicode data
2178 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002179 if (find_maxchar_surrogates(u, u + size,
2180 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 return NULL;
2182
Victor Stinner8faf8212011-12-08 22:14:11 +01002183 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 if (!unicode)
2185 return NULL;
2186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 switch (PyUnicode_KIND(unicode)) {
2188 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002189 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2191 break;
2192 case PyUnicode_2BYTE_KIND:
2193#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002194 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002196 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2198#endif
2199 break;
2200 case PyUnicode_4BYTE_KIND:
2201#if SIZEOF_WCHAR_T == 2
2202 /* This is the only case which has to process surrogates, thus
2203 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002204 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205#else
2206 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002207 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208#endif
2209 break;
2210 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002211 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002214 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215}
2216
Alexander Belopolsky40018472011-02-26 01:02:56 +00002217PyObject *
2218PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002219{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 if (size < 0) {
2221 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002222 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002223 return NULL;
2224 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002225 if (u != NULL)
2226 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2227 else
2228 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002229}
2230
Alexander Belopolsky40018472011-02-26 01:02:56 +00002231PyObject *
2232PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002233{
2234 size_t size = strlen(u);
2235 if (size > PY_SSIZE_T_MAX) {
2236 PyErr_SetString(PyExc_OverflowError, "input too long");
2237 return NULL;
2238 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002239 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002240}
2241
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002242PyObject *
2243_PyUnicode_FromId(_Py_Identifier *id)
2244{
2245 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002246 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2247 strlen(id->string),
2248 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002249 if (!id->object)
2250 return NULL;
2251 PyUnicode_InternInPlace(&id->object);
2252 assert(!id->next);
2253 id->next = static_strings;
2254 static_strings = id;
2255 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002256 return id->object;
2257}
2258
2259void
2260_PyUnicode_ClearStaticStrings()
2261{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002262 _Py_Identifier *tmp, *s = static_strings;
2263 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002264 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002265 tmp = s->next;
2266 s->next = NULL;
2267 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002268 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002269 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002270}
2271
Benjamin Peterson0df54292012-03-26 14:50:32 -04002272/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002273
Victor Stinnerd3f08822012-05-29 12:57:52 +02002274PyObject*
2275_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002276{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002277 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002278 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002279 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002280#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002281 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002282#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002283 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002284 }
Victor Stinner785938e2011-12-11 20:09:03 +01002285 unicode = PyUnicode_New(size, 127);
2286 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002287 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002288 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2289 assert(_PyUnicode_CheckConsistency(unicode, 1));
2290 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002291}
2292
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002293static Py_UCS4
2294kind_maxchar_limit(unsigned int kind)
2295{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002296 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002297 case PyUnicode_1BYTE_KIND:
2298 return 0x80;
2299 case PyUnicode_2BYTE_KIND:
2300 return 0x100;
2301 case PyUnicode_4BYTE_KIND:
2302 return 0x10000;
2303 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002304 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002305 }
2306}
2307
Victor Stinner702c7342011-10-05 13:50:52 +02002308static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002309_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002312 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002313
Serhiy Storchaka678db842013-01-26 12:16:36 +02002314 if (size == 0)
2315 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002316 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002317 if (size == 1)
2318 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002319
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002320 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002321 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322 if (!res)
2323 return NULL;
2324 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002325 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002326 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002327}
2328
Victor Stinnere57b1c02011-09-28 22:20:48 +02002329static PyObject*
2330_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002331{
2332 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002333 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002334
Serhiy Storchaka678db842013-01-26 12:16:36 +02002335 if (size == 0)
2336 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002337 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002338 if (size == 1)
2339 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002340
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002341 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002342 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 if (!res)
2344 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002345 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002346 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002347 else {
2348 _PyUnicode_CONVERT_BYTES(
2349 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2350 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002351 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 return res;
2353}
2354
Victor Stinnere57b1c02011-09-28 22:20:48 +02002355static PyObject*
2356_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357{
2358 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002359 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002360
Serhiy Storchaka678db842013-01-26 12:16:36 +02002361 if (size == 0)
2362 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002363 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002364 if (size == 1)
2365 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002366
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002367 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002368 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 if (!res)
2370 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002371 if (max_char < 256)
2372 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2373 PyUnicode_1BYTE_DATA(res));
2374 else if (max_char < 0x10000)
2375 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2376 PyUnicode_2BYTE_DATA(res));
2377 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002379 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 return res;
2381}
2382
2383PyObject*
2384PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2385{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002386 if (size < 0) {
2387 PyErr_SetString(PyExc_ValueError, "size must be positive");
2388 return NULL;
2389 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002390 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002392 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002394 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002396 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002398 PyErr_SetString(PyExc_SystemError, "invalid kind");
2399 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002401}
2402
Victor Stinnerece58de2012-04-23 23:36:38 +02002403Py_UCS4
2404_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2405{
2406 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002407 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002408
2409 assert(PyUnicode_IS_READY(unicode));
2410 assert(0 <= start);
2411 assert(end <= PyUnicode_GET_LENGTH(unicode));
2412 assert(start <= end);
2413
2414 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2415 return PyUnicode_MAX_CHAR_VALUE(unicode);
2416
2417 if (start == end)
2418 return 127;
2419
Victor Stinner94d558b2012-04-27 22:26:58 +02002420 if (PyUnicode_IS_ASCII(unicode))
2421 return 127;
2422
Victor Stinnerece58de2012-04-23 23:36:38 +02002423 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002424 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002425 endptr = (char *)startptr + end * kind;
2426 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002427 switch(kind) {
2428 case PyUnicode_1BYTE_KIND:
2429 return ucs1lib_find_max_char(startptr, endptr);
2430 case PyUnicode_2BYTE_KIND:
2431 return ucs2lib_find_max_char(startptr, endptr);
2432 case PyUnicode_4BYTE_KIND:
2433 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002434 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002435 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002436 }
2437}
2438
Victor Stinner25a4b292011-10-06 12:31:55 +02002439/* Ensure that a string uses the most efficient storage, if it is not the
2440 case: create a new string with of the right kind. Write NULL into *p_unicode
2441 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002442static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002443unicode_adjust_maxchar(PyObject **p_unicode)
2444{
2445 PyObject *unicode, *copy;
2446 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002447 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002448 unsigned int kind;
2449
2450 assert(p_unicode != NULL);
2451 unicode = *p_unicode;
2452 assert(PyUnicode_IS_READY(unicode));
2453 if (PyUnicode_IS_ASCII(unicode))
2454 return;
2455
2456 len = PyUnicode_GET_LENGTH(unicode);
2457 kind = PyUnicode_KIND(unicode);
2458 if (kind == PyUnicode_1BYTE_KIND) {
2459 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002460 max_char = ucs1lib_find_max_char(u, u + len);
2461 if (max_char >= 128)
2462 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002463 }
2464 else if (kind == PyUnicode_2BYTE_KIND) {
2465 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002466 max_char = ucs2lib_find_max_char(u, u + len);
2467 if (max_char >= 256)
2468 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002469 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002470 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002471 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002472 max_char = ucs4lib_find_max_char(u, u + len);
2473 if (max_char >= 0x10000)
2474 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002475 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002476 else
2477 Py_UNREACHABLE();
2478
Victor Stinner25a4b292011-10-06 12:31:55 +02002479 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002480 if (copy != NULL)
2481 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002482 Py_DECREF(unicode);
2483 *p_unicode = copy;
2484}
2485
Victor Stinner034f6cf2011-09-30 02:26:44 +02002486PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002487_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002488{
Victor Stinner87af4f22011-11-21 23:03:47 +01002489 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002490 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002491
Victor Stinner034f6cf2011-09-30 02:26:44 +02002492 if (!PyUnicode_Check(unicode)) {
2493 PyErr_BadInternalCall();
2494 return NULL;
2495 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002496 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002497 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002498
Victor Stinner87af4f22011-11-21 23:03:47 +01002499 length = PyUnicode_GET_LENGTH(unicode);
2500 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002501 if (!copy)
2502 return NULL;
2503 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2504
Christian Heimesf051e432016-09-13 20:22:02 +02002505 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002506 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002507 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002508 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002509}
2510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511
Victor Stinnerbc603d12011-10-02 01:00:40 +02002512/* Widen Unicode objects to larger buffers. Don't write terminating null
2513 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002514
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002515static void*
2516unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002518 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002519
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002520 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002521 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002522 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002523 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002524 if (!result)
2525 return PyErr_NoMemory();
2526 assert(skind == PyUnicode_1BYTE_KIND);
2527 _PyUnicode_CONVERT_BYTES(
2528 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002529 (const Py_UCS1 *)data,
2530 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002531 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002533 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002534 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002535 if (!result)
2536 return PyErr_NoMemory();
2537 if (skind == PyUnicode_2BYTE_KIND) {
2538 _PyUnicode_CONVERT_BYTES(
2539 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002540 (const Py_UCS2 *)data,
2541 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002542 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002544 else {
2545 assert(skind == PyUnicode_1BYTE_KIND);
2546 _PyUnicode_CONVERT_BYTES(
2547 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002548 (const Py_UCS1 *)data,
2549 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002550 result);
2551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002553 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002554 Py_UNREACHABLE();
2555 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557}
2558
2559static Py_UCS4*
2560as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2561 int copy_null)
2562{
2563 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002564 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 Py_ssize_t len, targetlen;
2566 if (PyUnicode_READY(string) == -1)
2567 return NULL;
2568 kind = PyUnicode_KIND(string);
2569 data = PyUnicode_DATA(string);
2570 len = PyUnicode_GET_LENGTH(string);
2571 targetlen = len;
2572 if (copy_null)
2573 targetlen++;
2574 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002575 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 if (!target) {
2577 PyErr_NoMemory();
2578 return NULL;
2579 }
2580 }
2581 else {
2582 if (targetsize < targetlen) {
2583 PyErr_Format(PyExc_SystemError,
2584 "string is longer than the buffer");
2585 if (copy_null && 0 < targetsize)
2586 target[0] = 0;
2587 return NULL;
2588 }
2589 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002590 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002591 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002592 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002594 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002595 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002596 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2597 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002598 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002599 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002600 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002601 else {
2602 Py_UNREACHABLE();
2603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 if (copy_null)
2605 target[len] = 0;
2606 return target;
2607}
2608
2609Py_UCS4*
2610PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2611 int copy_null)
2612{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002613 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 PyErr_BadInternalCall();
2615 return NULL;
2616 }
2617 return as_ucs4(string, target, targetsize, copy_null);
2618}
2619
2620Py_UCS4*
2621PyUnicode_AsUCS4Copy(PyObject *string)
2622{
2623 return as_ucs4(string, NULL, 0, 1);
2624}
2625
Victor Stinner15a11362012-10-06 23:48:20 +02002626/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002627 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2628 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2629#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002630
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002631static int
2632unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2633 Py_ssize_t width, Py_ssize_t precision)
2634{
2635 Py_ssize_t length, fill, arglen;
2636 Py_UCS4 maxchar;
2637
2638 if (PyUnicode_READY(str) == -1)
2639 return -1;
2640
2641 length = PyUnicode_GET_LENGTH(str);
2642 if ((precision == -1 || precision >= length)
2643 && width <= length)
2644 return _PyUnicodeWriter_WriteStr(writer, str);
2645
2646 if (precision != -1)
2647 length = Py_MIN(precision, length);
2648
2649 arglen = Py_MAX(length, width);
2650 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2651 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2652 else
2653 maxchar = writer->maxchar;
2654
2655 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2656 return -1;
2657
2658 if (width > length) {
2659 fill = width - length;
2660 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2661 return -1;
2662 writer->pos += fill;
2663 }
2664
2665 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2666 str, 0, length);
2667 writer->pos += length;
2668 return 0;
2669}
2670
2671static int
Victor Stinner998b8062018-09-12 00:23:25 +02002672unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002673 Py_ssize_t width, Py_ssize_t precision)
2674{
2675 /* UTF-8 */
2676 Py_ssize_t length;
2677 PyObject *unicode;
2678 int res;
2679
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002680 if (precision == -1) {
2681 length = strlen(str);
2682 }
2683 else {
2684 length = 0;
2685 while (length < precision && str[length]) {
2686 length++;
2687 }
2688 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002689 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2690 if (unicode == NULL)
2691 return -1;
2692
2693 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2694 Py_DECREF(unicode);
2695 return res;
2696}
2697
Victor Stinner96865452011-03-01 23:44:09 +00002698static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002699unicode_fromformat_arg(_PyUnicodeWriter *writer,
2700 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002701{
Victor Stinnere215d962012-10-06 23:03:36 +02002702 const char *p;
2703 Py_ssize_t len;
2704 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002705 Py_ssize_t width;
2706 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002707 int longflag;
2708 int longlongflag;
2709 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002710 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002711
2712 p = f;
2713 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002714 zeropad = 0;
2715 if (*f == '0') {
2716 zeropad = 1;
2717 f++;
2718 }
Victor Stinner96865452011-03-01 23:44:09 +00002719
2720 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002721 width = -1;
2722 if (Py_ISDIGIT((unsigned)*f)) {
2723 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002724 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002725 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002727 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002729 return NULL;
2730 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002731 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002732 f++;
2733 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002734 }
2735 precision = -1;
2736 if (*f == '.') {
2737 f++;
2738 if (Py_ISDIGIT((unsigned)*f)) {
2739 precision = (*f - '0');
2740 f++;
2741 while (Py_ISDIGIT((unsigned)*f)) {
2742 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2743 PyErr_SetString(PyExc_ValueError,
2744 "precision too big");
2745 return NULL;
2746 }
2747 precision = (precision * 10) + (*f - '0');
2748 f++;
2749 }
2750 }
Victor Stinner96865452011-03-01 23:44:09 +00002751 if (*f == '%') {
2752 /* "%.3%s" => f points to "3" */
2753 f--;
2754 }
2755 }
2756 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002757 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002758 f--;
2759 }
Victor Stinner96865452011-03-01 23:44:09 +00002760
2761 /* Handle %ld, %lu, %lld and %llu. */
2762 longflag = 0;
2763 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002764 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002765 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002766 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002767 longflag = 1;
2768 ++f;
2769 }
Victor Stinner96865452011-03-01 23:44:09 +00002770 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002771 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002772 longlongflag = 1;
2773 f += 2;
2774 }
Victor Stinner96865452011-03-01 23:44:09 +00002775 }
2776 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002777 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002778 size_tflag = 1;
2779 ++f;
2780 }
Victor Stinnere215d962012-10-06 23:03:36 +02002781
2782 if (f[1] == '\0')
2783 writer->overallocate = 0;
2784
2785 switch (*f) {
2786 case 'c':
2787 {
2788 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002789 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002790 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002791 "character argument not in range(0x110000)");
2792 return NULL;
2793 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002794 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002795 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002796 break;
2797 }
2798
2799 case 'i':
2800 case 'd':
2801 case 'u':
2802 case 'x':
2803 {
2804 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002805 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002806 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002807
2808 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002809 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002810 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002811 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002812 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002813 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002814 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002815 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002816 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002817 va_arg(*vargs, size_t));
2818 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002819 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002820 va_arg(*vargs, unsigned int));
2821 }
2822 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002823 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002824 }
2825 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002826 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002827 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002828 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002829 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002830 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002831 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002832 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002833 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002834 va_arg(*vargs, Py_ssize_t));
2835 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002836 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002837 va_arg(*vargs, int));
2838 }
2839 assert(len >= 0);
2840
Victor Stinnere215d962012-10-06 23:03:36 +02002841 if (precision < len)
2842 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002843
2844 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002845 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2846 return NULL;
2847
Victor Stinnere215d962012-10-06 23:03:36 +02002848 if (width > precision) {
2849 Py_UCS4 fillchar;
2850 fill = width - precision;
2851 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002852 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2853 return NULL;
2854 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002855 }
Victor Stinner15a11362012-10-06 23:48:20 +02002856 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002857 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002858 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2859 return NULL;
2860 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002861 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002862
Victor Stinner4a587072013-11-19 12:54:53 +01002863 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2864 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002865 break;
2866 }
2867
2868 case 'p':
2869 {
2870 char number[MAX_LONG_LONG_CHARS];
2871
2872 len = sprintf(number, "%p", va_arg(*vargs, void*));
2873 assert(len >= 0);
2874
2875 /* %p is ill-defined: ensure leading 0x. */
2876 if (number[1] == 'X')
2877 number[1] = 'x';
2878 else if (number[1] != 'x') {
2879 memmove(number + 2, number,
2880 strlen(number) + 1);
2881 number[0] = '0';
2882 number[1] = 'x';
2883 len += 2;
2884 }
2885
Victor Stinner4a587072013-11-19 12:54:53 +01002886 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002887 return NULL;
2888 break;
2889 }
2890
2891 case 's':
2892 {
2893 /* UTF-8 */
2894 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002895 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002896 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002897 break;
2898 }
2899
2900 case 'U':
2901 {
2902 PyObject *obj = va_arg(*vargs, PyObject *);
2903 assert(obj && _PyUnicode_CHECK(obj));
2904
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002905 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002906 return NULL;
2907 break;
2908 }
2909
2910 case 'V':
2911 {
2912 PyObject *obj = va_arg(*vargs, PyObject *);
2913 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002914 if (obj) {
2915 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002916 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002917 return NULL;
2918 }
2919 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002920 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002921 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002922 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002923 }
2924 break;
2925 }
2926
2927 case 'S':
2928 {
2929 PyObject *obj = va_arg(*vargs, PyObject *);
2930 PyObject *str;
2931 assert(obj);
2932 str = PyObject_Str(obj);
2933 if (!str)
2934 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002935 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002936 Py_DECREF(str);
2937 return NULL;
2938 }
2939 Py_DECREF(str);
2940 break;
2941 }
2942
2943 case 'R':
2944 {
2945 PyObject *obj = va_arg(*vargs, PyObject *);
2946 PyObject *repr;
2947 assert(obj);
2948 repr = PyObject_Repr(obj);
2949 if (!repr)
2950 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002951 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002952 Py_DECREF(repr);
2953 return NULL;
2954 }
2955 Py_DECREF(repr);
2956 break;
2957 }
2958
2959 case 'A':
2960 {
2961 PyObject *obj = va_arg(*vargs, PyObject *);
2962 PyObject *ascii;
2963 assert(obj);
2964 ascii = PyObject_ASCII(obj);
2965 if (!ascii)
2966 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002967 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002968 Py_DECREF(ascii);
2969 return NULL;
2970 }
2971 Py_DECREF(ascii);
2972 break;
2973 }
2974
2975 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002976 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002977 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002978 break;
2979
2980 default:
2981 /* if we stumble upon an unknown formatting code, copy the rest
2982 of the format string to the output string. (we cannot just
2983 skip the code, since there's no way to know what's in the
2984 argument list) */
2985 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002986 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002987 return NULL;
2988 f = p+len;
2989 return f;
2990 }
2991
2992 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002993 return f;
2994}
2995
Walter Dörwaldd2034312007-05-18 16:29:38 +00002996PyObject *
2997PyUnicode_FromFormatV(const char *format, va_list vargs)
2998{
Victor Stinnere215d962012-10-06 23:03:36 +02002999 va_list vargs2;
3000 const char *f;
3001 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003002
Victor Stinner8f674cc2013-04-17 23:02:17 +02003003 _PyUnicodeWriter_Init(&writer);
3004 writer.min_length = strlen(format) + 100;
3005 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003006
Benjamin Peterson0c212142016-09-20 20:39:33 -07003007 // Copy varags to be able to pass a reference to a subfunction.
3008 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003009
3010 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003011 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003012 f = unicode_fromformat_arg(&writer, f, &vargs2);
3013 if (f == NULL)
3014 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003016 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003017 const char *p;
3018 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003019
Victor Stinnere215d962012-10-06 23:03:36 +02003020 p = f;
3021 do
3022 {
3023 if ((unsigned char)*p > 127) {
3024 PyErr_Format(PyExc_ValueError,
3025 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3026 "string, got a non-ASCII byte: 0x%02x",
3027 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003028 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003029 }
3030 p++;
3031 }
3032 while (*p != '\0' && *p != '%');
3033 len = p - f;
3034
3035 if (*p == '\0')
3036 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003037
3038 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003039 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003040
3041 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003042 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003043 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003044 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003045 return _PyUnicodeWriter_Finish(&writer);
3046
3047 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003048 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003049 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003050 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003051}
3052
Walter Dörwaldd2034312007-05-18 16:29:38 +00003053PyObject *
3054PyUnicode_FromFormat(const char *format, ...)
3055{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003056 PyObject* ret;
3057 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003058
3059#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003060 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003061#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003062 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003063#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003064 ret = PyUnicode_FromFormatV(format, vargs);
3065 va_end(vargs);
3066 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003067}
3068
Serhiy Storchakac46db922018-10-23 22:58:24 +03003069static Py_ssize_t
3070unicode_get_widechar_size(PyObject *unicode)
3071{
3072 Py_ssize_t res;
3073
3074 assert(unicode != NULL);
3075 assert(_PyUnicode_CHECK(unicode));
3076
3077 if (_PyUnicode_WSTR(unicode) != NULL) {
3078 return PyUnicode_WSTR_LENGTH(unicode);
3079 }
3080 assert(PyUnicode_IS_READY(unicode));
3081
3082 res = _PyUnicode_LENGTH(unicode);
3083#if SIZEOF_WCHAR_T == 2
3084 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3085 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3086 const Py_UCS4 *end = s + res;
3087 for (; s < end; ++s) {
3088 if (*s > 0xFFFF) {
3089 ++res;
3090 }
3091 }
3092 }
3093#endif
3094 return res;
3095}
3096
3097static void
3098unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3099{
3100 const wchar_t *wstr;
3101
3102 assert(unicode != NULL);
3103 assert(_PyUnicode_CHECK(unicode));
3104
3105 wstr = _PyUnicode_WSTR(unicode);
3106 if (wstr != NULL) {
3107 memcpy(w, wstr, size * sizeof(wchar_t));
3108 return;
3109 }
3110 assert(PyUnicode_IS_READY(unicode));
3111
3112 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3113 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3114 for (; size--; ++s, ++w) {
3115 *w = *s;
3116 }
3117 }
3118 else {
3119#if SIZEOF_WCHAR_T == 4
3120 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3121 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3122 for (; size--; ++s, ++w) {
3123 *w = *s;
3124 }
3125#else
3126 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3127 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3128 for (; size--; ++s, ++w) {
3129 Py_UCS4 ch = *s;
3130 if (ch > 0xFFFF) {
3131 assert(ch <= MAX_UNICODE);
3132 /* encode surrogate pair in this case */
3133 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3134 if (!size--)
3135 break;
3136 *w = Py_UNICODE_LOW_SURROGATE(ch);
3137 }
3138 else {
3139 *w = ch;
3140 }
3141 }
3142#endif
3143 }
3144}
3145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003146#ifdef HAVE_WCHAR_H
3147
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003148/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003149
Victor Stinnerd88d9832011-09-06 02:00:05 +02003150 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003151 character) required to convert the unicode object. Ignore size argument.
3152
Victor Stinnerd88d9832011-09-06 02:00:05 +02003153 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003154 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003155 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003156Py_ssize_t
3157PyUnicode_AsWideChar(PyObject *unicode,
3158 wchar_t *w,
3159 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003160{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003161 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003162
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003163 if (unicode == NULL) {
3164 PyErr_BadInternalCall();
3165 return -1;
3166 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003167 if (!PyUnicode_Check(unicode)) {
3168 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003169 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003170 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003171
3172 res = unicode_get_widechar_size(unicode);
3173 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003174 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003175 }
3176
3177 if (size > res) {
3178 size = res + 1;
3179 }
3180 else {
3181 res = size;
3182 }
3183 unicode_copy_as_widechar(unicode, w, size);
3184 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003185}
3186
Victor Stinner137c34c2010-09-29 10:25:54 +00003187wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003188PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003189 Py_ssize_t *size)
3190{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003191 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003192 Py_ssize_t buflen;
3193
3194 if (unicode == NULL) {
3195 PyErr_BadInternalCall();
3196 return NULL;
3197 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003198 if (!PyUnicode_Check(unicode)) {
3199 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003200 return NULL;
3201 }
3202
Serhiy Storchakac46db922018-10-23 22:58:24 +03003203 buflen = unicode_get_widechar_size(unicode);
3204 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003205 if (buffer == NULL) {
3206 PyErr_NoMemory();
3207 return NULL;
3208 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003209 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3210 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003211 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003212 }
3213 else if (wcslen(buffer) != (size_t)buflen) {
3214 PyMem_FREE(buffer);
3215 PyErr_SetString(PyExc_ValueError,
3216 "embedded null character");
3217 return NULL;
3218 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003219 return buffer;
3220}
3221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003222#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223
Alexander Belopolsky40018472011-02-26 01:02:56 +00003224PyObject *
3225PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003226{
Victor Stinner8faf8212011-12-08 22:14:11 +01003227 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 PyErr_SetString(PyExc_ValueError,
3229 "chr() arg not in range(0x110000)");
3230 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003231 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003232
Victor Stinner985a82a2014-01-03 12:53:47 +01003233 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003234}
3235
Alexander Belopolsky40018472011-02-26 01:02:56 +00003236PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003237PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003239 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003241 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003242 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003243 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 Py_INCREF(obj);
3245 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003246 }
3247 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 /* For a Unicode subtype that's not a Unicode object,
3249 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003250 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003251 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003252 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003253 "Can't convert '%.100s' object to str implicitly",
3254 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003255 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003256}
3257
Alexander Belopolsky40018472011-02-26 01:02:56 +00003258PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003259PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003260 const char *encoding,
3261 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003262{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003263 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003264 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003265
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 PyErr_BadInternalCall();
3268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003270
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003271 /* Decoding bytes objects is the most common case and should be fast */
3272 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003273 if (PyBytes_GET_SIZE(obj) == 0) {
3274 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3275 return NULL;
3276 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003277 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003278 }
3279 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003280 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3281 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003282 }
3283
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003284 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 PyErr_SetString(PyExc_TypeError,
3286 "decoding str is not supported");
3287 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003288 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003289
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003290 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3291 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3292 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003293 "decoding to str: need a bytes-like object, %.80s found",
3294 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003295 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003296 }
Tim Petersced69f82003-09-16 20:30:58 +00003297
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003298 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003299 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003300 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3301 return NULL;
3302 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003303 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003305
Serhiy Storchaka05997252013-01-26 12:14:02 +02003306 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003307 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003308 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309}
3310
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Every
   run of other characters is replaced by a single '_', except at the very
   start of the name (where it is dropped) and at the end (where it is never
   emitted).  The result is always null-terminated on success.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating null */
    char *const limit = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!Py_ISALNUM(c) && c != '.') {
            /* punctuation: remember it, emit at most one '_' later */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3359
Alexander Belopolsky40018472011-02-26 01:02:56 +00003360PyObject *
3361PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003362 Py_ssize_t size,
3363 const char *encoding,
3364 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003365{
3366 PyObject *buffer = NULL, *unicode;
3367 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003368 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3369
Victor Stinner22eb6892019-06-26 00:51:05 +02003370 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3371 return NULL;
3372 }
3373
Victor Stinnered076ed2019-06-26 01:49:32 +02003374 if (size == 0) {
3375 _Py_RETURN_UNICODE_EMPTY();
3376 }
3377
Victor Stinner942889a2016-09-05 15:40:10 -07003378 if (encoding == NULL) {
3379 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3380 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003381
Fred Drakee4315f52000-05-09 19:53:39 +00003382 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003383 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3384 char *lower = buflower;
3385
3386 /* Fast paths */
3387 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3388 lower += 3;
3389 if (*lower == '_') {
3390 /* Match "utf8" and "utf_8" */
3391 lower++;
3392 }
3393
3394 if (lower[0] == '8' && lower[1] == 0) {
3395 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3396 }
3397 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3398 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3399 }
3400 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3401 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3402 }
3403 }
3404 else {
3405 if (strcmp(lower, "ascii") == 0
3406 || strcmp(lower, "us_ascii") == 0) {
3407 return PyUnicode_DecodeASCII(s, size, errors);
3408 }
Steve Dowercc16be82016-09-08 10:35:16 -07003409 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003410 else if (strcmp(lower, "mbcs") == 0) {
3411 return PyUnicode_DecodeMBCS(s, size, errors);
3412 }
3413 #endif
3414 else if (strcmp(lower, "latin1") == 0
3415 || strcmp(lower, "latin_1") == 0
3416 || strcmp(lower, "iso_8859_1") == 0
3417 || strcmp(lower, "iso8859_1") == 0) {
3418 return PyUnicode_DecodeLatin1(s, size, errors);
3419 }
3420 }
Victor Stinner37296e82010-06-10 13:36:23 +00003421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422
3423 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003424 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003425 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003426 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003427 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 if (buffer == NULL)
3429 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003430 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431 if (unicode == NULL)
3432 goto onError;
3433 if (!PyUnicode_Check(unicode)) {
3434 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003435 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003436 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003437 encoding,
3438 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003439 Py_DECREF(unicode);
3440 goto onError;
3441 }
3442 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003443 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003444
Benjamin Peterson29060642009-01-31 22:14:21 +00003445 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 Py_XDECREF(buffer);
3447 return NULL;
3448}
3449
Alexander Belopolsky40018472011-02-26 01:02:56 +00003450PyObject *
3451PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003452 const char *encoding,
3453 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003454{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003455 if (!PyUnicode_Check(unicode)) {
3456 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003457 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003458 }
3459
Serhiy Storchaka00939072016-10-27 21:05:49 +03003460 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3461 "PyUnicode_AsDecodedObject() is deprecated; "
3462 "use PyCodec_Decode() to decode from str", 1) < 0)
3463 return NULL;
3464
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003465 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003466 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003467
3468 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003469 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003470}
3471
Alexander Belopolsky40018472011-02-26 01:02:56 +00003472PyObject *
3473PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003474 const char *encoding,
3475 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003476{
3477 PyObject *v;
3478
3479 if (!PyUnicode_Check(unicode)) {
3480 PyErr_BadArgument();
3481 goto onError;
3482 }
3483
Serhiy Storchaka00939072016-10-27 21:05:49 +03003484 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3485 "PyUnicode_AsDecodedUnicode() is deprecated; "
3486 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3487 return NULL;
3488
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003489 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003490 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003491
3492 /* Decode via the codec registry */
3493 v = PyCodec_Decode(unicode, encoding, errors);
3494 if (v == NULL)
3495 goto onError;
3496 if (!PyUnicode_Check(v)) {
3497 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003498 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003499 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003500 encoding,
3501 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003502 Py_DECREF(v);
3503 goto onError;
3504 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003505 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003506
Benjamin Peterson29060642009-01-31 22:14:21 +00003507 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003508 return NULL;
3509}
3510
Alexander Belopolsky40018472011-02-26 01:02:56 +00003511PyObject *
3512PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003513 Py_ssize_t size,
3514 const char *encoding,
3515 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516{
3517 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003518
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003519 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3523 Py_DECREF(unicode);
3524 return v;
3525}
3526
Alexander Belopolsky40018472011-02-26 01:02:56 +00003527PyObject *
3528PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003529 const char *encoding,
3530 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003531{
3532 PyObject *v;
3533
3534 if (!PyUnicode_Check(unicode)) {
3535 PyErr_BadArgument();
3536 goto onError;
3537 }
3538
Serhiy Storchaka00939072016-10-27 21:05:49 +03003539 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3540 "PyUnicode_AsEncodedObject() is deprecated; "
3541 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3542 "or PyCodec_Encode() for generic encoding", 1) < 0)
3543 return NULL;
3544
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003545 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003546 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003547
3548 /* Encode via the codec registry */
3549 v = PyCodec_Encode(unicode, encoding, errors);
3550 if (v == NULL)
3551 goto onError;
3552 return v;
3553
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003555 return NULL;
3556}
3557
Victor Stinner1b579672011-12-17 05:47:23 +01003558
Victor Stinner2cba6b82018-01-10 22:46:15 +01003559static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003560unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003561 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003562{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003563 Py_ssize_t wlen;
3564 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3565 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003566 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003567 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003568
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003569 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003570 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003571 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003572 return NULL;
3573 }
3574
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003575 char *str;
3576 size_t error_pos;
3577 const char *reason;
3578 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003579 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003580 PyMem_Free(wstr);
3581
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003582 if (res != 0) {
3583 if (res == -2) {
3584 PyObject *exc;
3585 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3586 "locale", unicode,
3587 (Py_ssize_t)error_pos,
3588 (Py_ssize_t)(error_pos+1),
3589 reason);
3590 if (exc != NULL) {
3591 PyCodec_StrictErrors(exc);
3592 Py_DECREF(exc);
3593 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003594 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003595 else if (res == -3) {
3596 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3597 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003598 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003599 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003600 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003601 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003602 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003603
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003604 PyObject *bytes = PyBytes_FromString(str);
3605 PyMem_RawFree(str);
3606 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003607}
3608
Victor Stinnerad158722010-10-27 00:25:46 +00003609PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003610PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3611{
Victor Stinner709d23d2019-05-02 14:56:30 -04003612 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3613 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003614}
3615
3616PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003617PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003618{
Victor Stinner81a7be32020-04-14 15:14:01 +02003619 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003620 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003621 return unicode_encode_utf8(unicode,
3622 interp->fs_codec.error_handler,
3623 interp->fs_codec.errors);
3624 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003625#ifndef _Py_FORCE_UTF8_FS_ENCODING
3626 else if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003627 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003628 interp->fs_codec.encoding,
3629 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003630 }
Victor Stinnerad158722010-10-27 00:25:46 +00003631#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003632 else {
3633 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3634 machinery is not ready and so cannot be used:
3635 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003636 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3637 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003638 assert(filesystem_errors != NULL);
3639 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3640 assert(errors != _Py_ERROR_UNKNOWN);
3641#ifdef _Py_FORCE_UTF8_FS_ENCODING
3642 return unicode_encode_utf8(unicode, errors, NULL);
3643#else
3644 return unicode_encode_locale(unicode, errors, 0);
3645#endif
3646 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003647}
3648
Alexander Belopolsky40018472011-02-26 01:02:56 +00003649PyObject *
3650PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003651 const char *encoding,
3652 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653{
3654 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003655 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003656
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 if (!PyUnicode_Check(unicode)) {
3658 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 }
Fred Drakee4315f52000-05-09 19:53:39 +00003661
Victor Stinner22eb6892019-06-26 00:51:05 +02003662 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3663 return NULL;
3664 }
3665
Victor Stinner942889a2016-09-05 15:40:10 -07003666 if (encoding == NULL) {
3667 return _PyUnicode_AsUTF8String(unicode, errors);
3668 }
3669
Fred Drakee4315f52000-05-09 19:53:39 +00003670 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003671 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3672 char *lower = buflower;
3673
3674 /* Fast paths */
3675 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3676 lower += 3;
3677 if (*lower == '_') {
3678 /* Match "utf8" and "utf_8" */
3679 lower++;
3680 }
3681
3682 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003684 }
3685 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3686 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3687 }
3688 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3689 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3690 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003691 }
Victor Stinner942889a2016-09-05 15:40:10 -07003692 else {
3693 if (strcmp(lower, "ascii") == 0
3694 || strcmp(lower, "us_ascii") == 0) {
3695 return _PyUnicode_AsASCIIString(unicode, errors);
3696 }
Steve Dowercc16be82016-09-08 10:35:16 -07003697#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003698 else if (strcmp(lower, "mbcs") == 0) {
3699 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3700 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003701#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003702 else if (strcmp(lower, "latin1") == 0 ||
3703 strcmp(lower, "latin_1") == 0 ||
3704 strcmp(lower, "iso_8859_1") == 0 ||
3705 strcmp(lower, "iso8859_1") == 0) {
3706 return _PyUnicode_AsLatin1String(unicode, errors);
3707 }
3708 }
Victor Stinner37296e82010-06-10 13:36:23 +00003709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710
3711 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003712 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003714 return NULL;
3715
3716 /* The normal path */
3717 if (PyBytes_Check(v))
3718 return v;
3719
3720 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003721 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003722 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003723 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003724
3725 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003726 "encoder %s returned bytearray instead of bytes; "
3727 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003728 encoding);
3729 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003730 Py_DECREF(v);
3731 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003732 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003733
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003734 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3735 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003736 Py_DECREF(v);
3737 return b;
3738 }
3739
3740 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003741 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003742 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003743 encoding,
3744 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003745 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003746 return NULL;
3747}
3748
Alexander Belopolsky40018472011-02-26 01:02:56 +00003749PyObject *
3750PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003751 const char *encoding,
3752 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003753{
3754 PyObject *v;
3755
3756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 goto onError;
3759 }
3760
Serhiy Storchaka00939072016-10-27 21:05:49 +03003761 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3762 "PyUnicode_AsEncodedUnicode() is deprecated; "
3763 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3764 return NULL;
3765
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003766 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003767 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003768
3769 /* Encode via the codec registry */
3770 v = PyCodec_Encode(unicode, encoding, errors);
3771 if (v == NULL)
3772 goto onError;
3773 if (!PyUnicode_Check(v)) {
3774 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003775 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003776 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003777 encoding,
3778 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003779 Py_DECREF(v);
3780 goto onError;
3781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003783
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 return NULL;
3786}
3787
Victor Stinner2cba6b82018-01-10 22:46:15 +01003788static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003789unicode_decode_locale(const char *str, Py_ssize_t len,
3790 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003791{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003792 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3793 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003794 return NULL;
3795 }
3796
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003797 wchar_t *wstr;
3798 size_t wlen;
3799 const char *reason;
3800 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003801 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003802 if (res != 0) {
3803 if (res == -2) {
3804 PyObject *exc;
3805 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3806 "locale", str, len,
3807 (Py_ssize_t)wlen,
3808 (Py_ssize_t)(wlen + 1),
3809 reason);
3810 if (exc != NULL) {
3811 PyCodec_StrictErrors(exc);
3812 Py_DECREF(exc);
3813 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003814 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003815 else if (res == -3) {
3816 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3817 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003818 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003819 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003820 }
Victor Stinner2f197072011-12-17 07:08:30 +01003821 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003822 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003823
3824 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3825 PyMem_RawFree(wstr);
3826 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003827}
3828
3829PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003830PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3831 const char *errors)
3832{
Victor Stinner709d23d2019-05-02 14:56:30 -04003833 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3834 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003835}
3836
3837PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003838PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003839{
3840 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003841 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3842 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003843}
3844
3845
3846PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003847PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003848 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003849 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3850}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003851
Christian Heimes5894ba72007-11-04 11:43:14 +00003852PyObject*
3853PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3854{
Victor Stinner81a7be32020-04-14 15:14:01 +02003855 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003856 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003857 return unicode_decode_utf8(s, size,
3858 interp->fs_codec.error_handler,
3859 interp->fs_codec.errors,
3860 NULL);
3861 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003862#ifndef _Py_FORCE_UTF8_FS_ENCODING
3863 else if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003864 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003865 interp->fs_codec.encoding,
3866 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003867 }
Victor Stinnerad158722010-10-27 00:25:46 +00003868#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003869 else {
3870 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3871 machinery is not ready and so cannot be used:
3872 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003873 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3874 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003875 assert(filesystem_errors != NULL);
3876 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3877 assert(errors != _Py_ERROR_UNKNOWN);
3878#ifdef _Py_FORCE_UTF8_FS_ENCODING
3879 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3880#else
3881 return unicode_decode_locale(s, size, errors, 0);
3882#endif
3883 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003884}
3885
Martin v. Löwis011e8422009-05-05 04:43:17 +00003886
3887int
3888PyUnicode_FSConverter(PyObject* arg, void* addr)
3889{
Brett Cannonec6ce872016-09-06 15:50:29 -07003890 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003891 PyObject *output = NULL;
3892 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003893 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003894 if (arg == NULL) {
3895 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003896 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003897 return 1;
3898 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003899 path = PyOS_FSPath(arg);
3900 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003901 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003902 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003903 if (PyBytes_Check(path)) {
3904 output = path;
3905 }
3906 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3907 output = PyUnicode_EncodeFSDefault(path);
3908 Py_DECREF(path);
3909 if (!output) {
3910 return 0;
3911 }
3912 assert(PyBytes_Check(output));
3913 }
3914
Victor Stinner0ea2a462010-04-30 00:22:08 +00003915 size = PyBytes_GET_SIZE(output);
3916 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003917 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003918 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003919 Py_DECREF(output);
3920 return 0;
3921 }
3922 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003923 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003924}
3925
3926
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003927int
3928PyUnicode_FSDecoder(PyObject* arg, void* addr)
3929{
Brett Cannona5711202016-09-06 19:36:01 -07003930 int is_buffer = 0;
3931 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003932 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003933 if (arg == NULL) {
3934 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003935 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003936 return 1;
3937 }
Brett Cannona5711202016-09-06 19:36:01 -07003938
3939 is_buffer = PyObject_CheckBuffer(arg);
3940 if (!is_buffer) {
3941 path = PyOS_FSPath(arg);
3942 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003943 return 0;
3944 }
Brett Cannona5711202016-09-06 19:36:01 -07003945 }
3946 else {
3947 path = arg;
3948 Py_INCREF(arg);
3949 }
3950
3951 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003952 output = path;
3953 }
3954 else if (PyBytes_Check(path) || is_buffer) {
3955 PyObject *path_bytes = NULL;
3956
3957 if (!PyBytes_Check(path) &&
3958 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003959 "path should be string, bytes, or os.PathLike, not %.200s",
3960 Py_TYPE(arg)->tp_name)) {
3961 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003962 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003963 }
3964 path_bytes = PyBytes_FromObject(path);
3965 Py_DECREF(path);
3966 if (!path_bytes) {
3967 return 0;
3968 }
3969 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3970 PyBytes_GET_SIZE(path_bytes));
3971 Py_DECREF(path_bytes);
3972 if (!output) {
3973 return 0;
3974 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003975 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003976 else {
3977 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003978 "path should be string, bytes, or os.PathLike, not %.200s",
3979 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003980 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003981 return 0;
3982 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003983 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003984 Py_DECREF(output);
3985 return 0;
3986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003988 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003989 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003990 Py_DECREF(output);
3991 return 0;
3992 }
3993 *(PyObject**)addr = output;
3994 return Py_CLEANUP_SUPPORTED;
3995}
3996
3997
Inada Naoki02a4d572020-02-27 13:48:59 +09003998static int unicode_fill_utf8(PyObject *unicode);
3999
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004000const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004002{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004003 if (!PyUnicode_Check(unicode)) {
4004 PyErr_BadArgument();
4005 return NULL;
4006 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004007 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004008 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004010 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004011 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 return NULL;
4013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 }
4015
4016 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004017 *psize = PyUnicode_UTF8_LENGTH(unicode);
4018 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004019}
4020
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004021const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004022PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4025}
4026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027Py_UNICODE *
4028PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 if (!PyUnicode_Check(unicode)) {
4031 PyErr_BadArgument();
4032 return NULL;
4033 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004034 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4035 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004037 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004038 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039
Serhiy Storchakac46db922018-10-23 22:58:24 +03004040 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4041 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4042 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004045 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4046 if (w == NULL) {
4047 PyErr_NoMemory();
4048 return NULL;
4049 }
4050 unicode_copy_as_widechar(unicode, w, wlen + 1);
4051 _PyUnicode_WSTR(unicode) = w;
4052 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4053 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 }
4055 }
4056 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004057 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004058 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004059}
4060
Alexander Belopolsky40018472011-02-26 01:02:56 +00004061Py_UNICODE *
4062PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065}
4066
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004067const Py_UNICODE *
4068_PyUnicode_AsUnicode(PyObject *unicode)
4069{
4070 Py_ssize_t size;
4071 const Py_UNICODE *wstr;
4072
4073 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4074 if (wstr && wcslen(wstr) != (size_t)size) {
4075 PyErr_SetString(PyExc_ValueError, "embedded null character");
4076 return NULL;
4077 }
4078 return wstr;
4079}
4080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081
Alexander Belopolsky40018472011-02-26 01:02:56 +00004082Py_ssize_t
4083PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084{
4085 if (!PyUnicode_Check(unicode)) {
4086 PyErr_BadArgument();
4087 goto onError;
4088 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004089 if (_PyUnicode_WSTR(unicode) == NULL) {
4090 if (PyUnicode_AsUnicode(unicode) == NULL)
4091 goto onError;
4092 }
4093 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094
Benjamin Peterson29060642009-01-31 22:14:21 +00004095 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 return -1;
4097}
4098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099Py_ssize_t
4100PyUnicode_GetLength(PyObject *unicode)
4101{
Victor Stinner07621332012-06-16 04:53:46 +02004102 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004103 PyErr_BadArgument();
4104 return -1;
4105 }
Victor Stinner07621332012-06-16 04:53:46 +02004106 if (PyUnicode_READY(unicode) == -1)
4107 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108 return PyUnicode_GET_LENGTH(unicode);
4109}
4110
4111Py_UCS4
4112PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4113{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004114 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004115 int kind;
4116
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004117 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004118 PyErr_BadArgument();
4119 return (Py_UCS4)-1;
4120 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004121 if (PyUnicode_READY(unicode) == -1) {
4122 return (Py_UCS4)-1;
4123 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004124 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004125 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004126 return (Py_UCS4)-1;
4127 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004128 data = PyUnicode_DATA(unicode);
4129 kind = PyUnicode_KIND(unicode);
4130 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131}
4132
4133int
4134PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4135{
4136 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004137 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138 return -1;
4139 }
Victor Stinner488fa492011-12-12 00:01:39 +01004140 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004141 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004142 PyErr_SetString(PyExc_IndexError, "string index out of range");
4143 return -1;
4144 }
Victor Stinner488fa492011-12-12 00:01:39 +01004145 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004146 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004147 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4148 PyErr_SetString(PyExc_ValueError, "character out of range");
4149 return -1;
4150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4152 index, ch);
4153 return 0;
4154}
4155
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage; the caller must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4161
Victor Stinner554f3f02010-06-16 23:33:54 +00004162/* create or adjust a UnicodeDecodeError */
4163static void
4164make_decode_exception(PyObject **exceptionObject,
4165 const char *encoding,
4166 const char *input, Py_ssize_t length,
4167 Py_ssize_t startpos, Py_ssize_t endpos,
4168 const char *reason)
4169{
4170 if (*exceptionObject == NULL) {
4171 *exceptionObject = PyUnicodeDecodeError_Create(
4172 encoding, input, length, startpos, endpos, reason);
4173 }
4174 else {
4175 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4176 goto onError;
4177 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4178 goto onError;
4179 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4180 goto onError;
4181 }
4182 return;
4183
4184onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004185 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004186}
4187
Steve Dowercc16be82016-09-08 10:35:16 -07004188#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004189static int
4190widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4191{
4192 if (newsize > *size) {
4193 wchar_t *newbuf = *buf;
4194 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4195 PyErr_NoMemory();
4196 return -1;
4197 }
4198 *buf = newbuf;
4199 }
4200 *size = newsize;
4201 return 0;
4202}
4203
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204/* error handling callback helper:
4205 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004206 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 and adjust various state variables.
4208 return 0 on success, -1 on error
4209*/
4210
Alexander Belopolsky40018472011-02-26 01:02:56 +00004211static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004212unicode_decode_call_errorhandler_wchar(
4213 const char *errors, PyObject **errorHandler,
4214 const char *encoding, const char *reason,
4215 const char **input, const char **inend, Py_ssize_t *startinpos,
4216 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004217 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004219 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220
4221 PyObject *restuple = NULL;
4222 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004223 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004224 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004225 Py_ssize_t requiredsize;
4226 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004227 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004228 wchar_t *repwstr;
4229 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230
4231 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004232 *errorHandler = PyCodec_LookupError(errors);
4233 if (*errorHandler == NULL)
4234 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 }
4236
Victor Stinner554f3f02010-06-16 23:33:54 +00004237 make_decode_exception(exceptionObject,
4238 encoding,
4239 *input, *inend - *input,
4240 *startinpos, *endinpos,
4241 reason);
4242 if (*exceptionObject == NULL)
4243 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244
Petr Viktorinffd97532020-02-11 17:46:57 +01004245 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004249 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004252 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004254
4255 /* Copy back the bytes variables, which might have been modified by the
4256 callback */
4257 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4258 if (!inputobj)
4259 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004260 *input = PyBytes_AS_STRING(inputobj);
4261 insize = PyBytes_GET_SIZE(inputobj);
4262 *inend = *input + insize;
4263 /* we can DECREF safely, as the exception has another reference,
4264 so the object won't go away. */
4265 Py_DECREF(inputobj);
4266
4267 if (newpos<0)
4268 newpos = insize+newpos;
4269 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004270 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004271 goto onError;
4272 }
4273
4274 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4275 if (repwstr == NULL)
4276 goto onError;
4277 /* need more space? (at least enough for what we
4278 have+the replacement+the rest of the string (starting
4279 at the new input position), so we won't have to check space
4280 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004281 requiredsize = *outpos;
4282 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4283 goto overflow;
4284 requiredsize += repwlen;
4285 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4286 goto overflow;
4287 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004288 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004289 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004290 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004291 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004292 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004293 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004294 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004296 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004297 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298 *endinpos = newpos;
4299 *inptr = *input + newpos;
4300
4301 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004302 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 return 0;
4304
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004305 overflow:
4306 PyErr_SetString(PyExc_OverflowError,
4307 "decoded result is too long for a Python string");
4308
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004309 onError:
4310 Py_XDECREF(restuple);
4311 return -1;
4312}
Steve Dowercc16be82016-09-08 10:35:16 -07004313#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004314
4315static int
4316unicode_decode_call_errorhandler_writer(
4317 const char *errors, PyObject **errorHandler,
4318 const char *encoding, const char *reason,
4319 const char **input, const char **inend, Py_ssize_t *startinpos,
4320 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4321 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4322{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004323 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004324
4325 PyObject *restuple = NULL;
4326 PyObject *repunicode = NULL;
4327 Py_ssize_t insize;
4328 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004329 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004330 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004331 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004332 int need_to_grow = 0;
4333 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004334
4335 if (*errorHandler == NULL) {
4336 *errorHandler = PyCodec_LookupError(errors);
4337 if (*errorHandler == NULL)
4338 goto onError;
4339 }
4340
4341 make_decode_exception(exceptionObject,
4342 encoding,
4343 *input, *inend - *input,
4344 *startinpos, *endinpos,
4345 reason);
4346 if (*exceptionObject == NULL)
4347 goto onError;
4348
Petr Viktorinffd97532020-02-11 17:46:57 +01004349 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350 if (restuple == NULL)
4351 goto onError;
4352 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004353 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354 goto onError;
4355 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004356 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004357 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004358
4359 /* Copy back the bytes variables, which might have been modified by the
4360 callback */
4361 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4362 if (!inputobj)
4363 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004364 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004365 *input = PyBytes_AS_STRING(inputobj);
4366 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004367 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004368 /* we can DECREF safely, as the exception has another reference,
4369 so the object won't go away. */
4370 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004374 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004375 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004377 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378
Victor Stinner170ca6f2013-04-18 00:25:28 +02004379 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004380 if (replen > 1) {
4381 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004382 need_to_grow = 1;
4383 }
4384 new_inptr = *input + newpos;
4385 if (*inend - new_inptr > remain) {
4386 /* We don't know the decoding algorithm here so we make the worst
4387 assumption that one byte decodes to one unicode character.
4388 If unfortunately one byte could decode to more unicode characters,
4389 the decoder may write out-of-bound then. Is it possible for the
4390 algorithms using this function? */
4391 writer->min_length += *inend - new_inptr - remain;
4392 need_to_grow = 1;
4393 }
4394 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004395 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004396 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004397 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4398 goto onError;
4399 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004400 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004401 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004402
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004404 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004407 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004408 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004412 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413}
4414
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once, so arguments must be free of side effects. */

/* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')? */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and
   anything else (i.e. '/') to 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4449
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0-127) and is never modified, hence
 * const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4483
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the character; only values 1..127 can be encoded directly,
 *            and the range check runs before utf7_category is indexed.
 * directO  : true if "Set O" characters (category 1) are encoded as is.
 * directWS : true if whitespace (category 2) is encoded as is.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions such as `a || b` can be passed safely.  c may be evaluated
 * several times and must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004495
Alexander Belopolsky40018472011-02-26 01:02:56 +00004496PyObject *
4497PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004498 Py_ssize_t size,
4499 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004500{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004501 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4502}
4503
Antoine Pitrou244651a2009-05-04 18:56:13 +00004504/* The decoder. The only state we preserve is our read position,
4505 * i.e. how many characters we have consumed. So if we end in the
4506 * middle of a shift sequence we have to back off the read position
4507 * and the output to the beginning of the sequence, otherwise we lose
4508 * all the shift state (seen bits, number of bits seen, high
4509 * surrogate). */
4510
Alexander Belopolsky40018472011-02-26 01:02:56 +00004511PyObject *
4512PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004513 Py_ssize_t size,
4514 const char *errors,
4515 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004516{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004518 Py_ssize_t startinpos;
4519 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004521 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522 const char *errmsg = "";
4523 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004524 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 unsigned int base64bits = 0;
4526 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004527 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 PyObject *errorHandler = NULL;
4529 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004531 if (size == 0) {
4532 if (consumed)
4533 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004534 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004535 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004537 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004538 _PyUnicodeWriter_Init(&writer);
4539 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004540
4541 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542 e = s + size;
4543
4544 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004545 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004547 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548
Antoine Pitrou244651a2009-05-04 18:56:13 +00004549 if (inShift) { /* in a base-64 section */
4550 if (IS_BASE64(ch)) { /* consume a base-64 character */
4551 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4552 base64bits += 6;
4553 s++;
4554 if (base64bits >= 16) {
4555 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004556 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557 base64bits -= 16;
4558 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004559 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 if (surrogate) {
4561 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004562 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4563 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004564 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004565 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004567 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568 }
4569 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004570 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004571 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
4574 }
Victor Stinner551ac952011-11-29 22:58:13 +01004575 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 /* first surrogate */
4577 surrogate = outCh;
4578 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004580 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004581 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 }
4583 }
4584 }
4585 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004586 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 if (base64bits > 0) { /* left-over bits */
4588 if (base64bits >= 6) {
4589 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004590 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 errmsg = "partial character in shift sequence";
4592 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004593 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 else {
4595 /* Some bits remain; they should be zero */
4596 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004597 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004598 errmsg = "non-zero padding bits in shift sequence";
4599 goto utf7Error;
4600 }
4601 }
4602 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004603 if (surrogate && DECODE_DIRECT(ch)) {
4604 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4605 goto onError;
4606 }
4607 surrogate = 0;
4608 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 /* '-' is absorbed; other terminating
4610 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004611 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004612 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613 }
4614 }
4615 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617 s++; /* consume '+' */
4618 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004620 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004621 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004623 else if (s < e && !IS_BASE64(*s)) {
4624 s++;
4625 errmsg = "ill-formed sequence";
4626 goto utf7Error;
4627 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004629 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004630 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004631 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004633 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004634 }
4635 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004636 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004638 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004639 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004641 else {
4642 startinpos = s-starts;
4643 s++;
4644 errmsg = "unexpected special character";
4645 goto utf7Error;
4646 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004648utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004650 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004651 errors, &errorHandler,
4652 "utf7", errmsg,
4653 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004654 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656 }
4657
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 /* end of string */
4659
4660 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4661 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004662 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 if (surrogate ||
4664 (base64bits >= 6) ||
4665 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004666 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004667 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004668 errors, &errorHandler,
4669 "utf7", "unterminated shift sequence",
4670 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004671 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004672 goto onError;
4673 if (s < e)
4674 goto restart;
4675 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004676 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004677
4678 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004679 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004680 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004681 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004682 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004683 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004684 writer.kind, writer.data, shiftOutStart);
4685 Py_XDECREF(errorHandler);
4686 Py_XDECREF(exc);
4687 _PyUnicodeWriter_Dealloc(&writer);
4688 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004689 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004690 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004691 }
4692 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004693 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004695 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 Py_XDECREF(errorHandler);
4698 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004699 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004700
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004702 Py_XDECREF(errorHandler);
4703 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004704 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705 return NULL;
4706}
4707
4708
Alexander Belopolsky40018472011-02-26 01:02:56 +00004709PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004710_PyUnicode_EncodeUTF7(PyObject *str,
4711 int base64SetO,
4712 int base64WhiteSpace,
4713 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004714{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004715 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004716 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004717 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004718 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004719 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004720 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004721 unsigned int base64bits = 0;
4722 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004723 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004724 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004725
Benjamin Petersonbac79492012-01-14 13:34:47 -05004726 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004727 return NULL;
4728 kind = PyUnicode_KIND(str);
4729 data = PyUnicode_DATA(str);
4730 len = PyUnicode_GET_LENGTH(str);
4731
4732 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004733 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004734
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004735 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004736 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004737 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004738 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739 if (v == NULL)
4740 return NULL;
4741
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004742 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004743 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004744 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745
Antoine Pitrou244651a2009-05-04 18:56:13 +00004746 if (inShift) {
4747 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4748 /* shifting out */
4749 if (base64bits) { /* output remaining bits */
4750 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4751 base64buffer = 0;
4752 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753 }
4754 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004755 /* Characters not in the BASE64 set implicitly unshift the sequence
4756 so no '-' is required, except if the character is itself a '-' */
4757 if (IS_BASE64(ch) || ch == '-') {
4758 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004759 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004760 *out++ = (char) ch;
4761 }
4762 else {
4763 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004764 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004766 else { /* not in a shift sequence */
4767 if (ch == '+') {
4768 *out++ = '+';
4769 *out++ = '-';
4770 }
4771 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4772 *out++ = (char) ch;
4773 }
4774 else {
4775 *out++ = '+';
4776 inShift = 1;
4777 goto encode_char;
4778 }
4779 }
4780 continue;
4781encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004783 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004784
Antoine Pitrou244651a2009-05-04 18:56:13 +00004785 /* code first surrogate */
4786 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004787 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 while (base64bits >= 6) {
4789 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4790 base64bits -= 6;
4791 }
4792 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004793 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004795 base64bits += 16;
4796 base64buffer = (base64buffer << 16) | ch;
4797 while (base64bits >= 6) {
4798 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4799 base64bits -= 6;
4800 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004801 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004802 if (base64bits)
4803 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4804 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004805 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004806 if (_PyBytes_Resize(&v, out - start) < 0)
4807 return NULL;
4808 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004809}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004810PyObject *
4811PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4812 Py_ssize_t size,
4813 int base64SetO,
4814 int base64WhiteSpace,
4815 const char *errors)
4816{
4817 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004818 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004819 if (tmp == NULL)
4820 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004821 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004822 base64WhiteSpace, errors);
4823 Py_DECREF(tmp);
4824 return result;
4825}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004826
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827#undef IS_BASE64
4828#undef FROM_BASE64
4829#undef TO_BASE64
4830#undef DECODE_DIRECT
4831#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004832
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833/* --- UTF-8 Codec -------------------------------------------------------- */
4834
Alexander Belopolsky40018472011-02-26 01:02:56 +00004835PyObject *
4836PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004837 Py_ssize_t size,
4838 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839{
Walter Dörwald69652032004-09-07 20:24:22 +00004840 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4841}
4842
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004843#include "stringlib/asciilib.h"
4844#include "stringlib/codecs.h"
4845#include "stringlib/undef.h"
4846
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004847#include "stringlib/ucs1lib.h"
4848#include "stringlib/codecs.h"
4849#include "stringlib/undef.h"
4850
4851#include "stringlib/ucs2lib.h"
4852#include "stringlib/codecs.h"
4853#include "stringlib/undef.h"
4854
4855#include "stringlib/ucs4lib.h"
4856#include "stringlib/codecs.h"
4857#include "stringlib/undef.h"
4858
Antoine Pitrouab868312009-01-10 15:40:25 +00004859/* Mask to quickly check whether a C 'long' contains a
4860 non-ASCII, UTF8-encoded char. */
4861#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004862# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004863#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004864# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004865#else
4866# error C 'long' size should be either 4 or 8!
4867#endif
4868
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004869static Py_ssize_t
4870ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004871{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004872 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004873 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004874
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004875 /*
4876 * Issue #17237: m68k is a bit different from most architectures in
4877 * that objects do not use "natural alignment" - for example, int and
4878 * long are only aligned at 2-byte boundaries. Therefore the assert()
4879 * won't work; also, tests have shown that skipping the "optimised
4880 * version" will even speed up m68k.
4881 */
4882#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004884 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4885 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004886 /* Fast path, see in STRINGLIB(utf8_decode) for
4887 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004888 /* Help allocation */
4889 const char *_p = p;
4890 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 while (_p < aligned_end) {
4892 unsigned long value = *(const unsigned long *) _p;
4893 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004894 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004895 *((unsigned long *)q) = value;
4896 _p += SIZEOF_LONG;
4897 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004898 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 p = _p;
4900 while (p < end) {
4901 if ((unsigned char)*p & 0x80)
4902 break;
4903 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004907#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004908#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004909 while (p < end) {
4910 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4911 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004912 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004913 /* Help allocation */
4914 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004916 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 if (value & ASCII_CHAR_MASK)
4918 break;
4919 _p += SIZEOF_LONG;
4920 }
4921 p = _p;
4922 if (_p == end)
4923 break;
4924 }
4925 if ((unsigned char)*p & 0x80)
4926 break;
4927 ++p;
4928 }
4929 memcpy(dest, start, p - start);
4930 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931}
Antoine Pitrouab868312009-01-10 15:40:25 +00004932
Victor Stinner709d23d2019-05-02 14:56:30 -04004933static PyObject *
4934unicode_decode_utf8(const char *s, Py_ssize_t size,
4935 _Py_error_handler error_handler, const char *errors,
4936 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004937{
Victor Stinner785938e2011-12-11 20:09:03 +01004938 if (size == 0) {
4939 if (consumed)
4940 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004941 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004942 }
4943
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4945 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004946 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 *consumed = 1;
4948 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004949 }
4950
Inada Naoki770847a2019-06-24 12:30:24 +09004951 const char *starts = s;
4952 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004953
Inada Naoki770847a2019-06-24 12:30:24 +09004954 // fast path: try ASCII string.
4955 PyObject *u = PyUnicode_New(size, 127);
4956 if (u == NULL) {
4957 return NULL;
4958 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004959 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09004960 if (s == end) {
4961 return u;
4962 }
4963
4964 // Use _PyUnicodeWriter after fast path is failed.
4965 _PyUnicodeWriter writer;
4966 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4967 writer.pos = s - starts;
4968
4969 Py_ssize_t startinpos, endinpos;
4970 const char *errmsg = "";
4971 PyObject *error_handler_obj = NULL;
4972 PyObject *exc = NULL;
4973
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004974 while (s < end) {
4975 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004976 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004977
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004979 if (PyUnicode_IS_ASCII(writer.buffer))
4980 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004981 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004982 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004983 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004984 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004985 } else {
4986 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004988 }
4989
4990 switch (ch) {
4991 case 0:
4992 if (s == end || consumed)
4993 goto End;
4994 errmsg = "unexpected end of data";
4995 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004996 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 break;
4998 case 1:
4999 errmsg = "invalid start byte";
5000 startinpos = s - starts;
5001 endinpos = startinpos + 1;
5002 break;
5003 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005004 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5005 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5006 {
5007 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005008 goto End;
5009 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005010 /* fall through */
5011 case 3:
5012 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 errmsg = "invalid continuation byte";
5014 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005015 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005016 break;
5017 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005018 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005019 goto onError;
5020 continue;
5021 }
5022
Victor Stinner1d65d912015-10-05 13:43:50 +02005023 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005024 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005025
5026 switch (error_handler) {
5027 case _Py_ERROR_IGNORE:
5028 s += (endinpos - startinpos);
5029 break;
5030
5031 case _Py_ERROR_REPLACE:
5032 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5033 goto onError;
5034 s += (endinpos - startinpos);
5035 break;
5036
5037 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005038 {
5039 Py_ssize_t i;
5040
Victor Stinner1d65d912015-10-05 13:43:50 +02005041 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5042 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005043 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005044 ch = (Py_UCS4)(unsigned char)(starts[i]);
5045 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5046 ch + 0xdc00);
5047 writer.pos++;
5048 }
5049 s += (endinpos - startinpos);
5050 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005051 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005052
5053 default:
5054 if (unicode_decode_call_errorhandler_writer(
5055 errors, &error_handler_obj,
5056 "utf-8", errmsg,
5057 &starts, &end, &startinpos, &endinpos, &exc, &s,
5058 &writer))
5059 goto onError;
5060 }
Victor Stinner785938e2011-12-11 20:09:03 +01005061 }
5062
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005063End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005064 if (consumed)
5065 *consumed = s - starts;
5066
Victor Stinner1d65d912015-10-05 13:43:50 +02005067 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005068 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005069 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005070
5071onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005072 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005073 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005074 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005076}
5077
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005078
Victor Stinner709d23d2019-05-02 14:56:30 -04005079PyObject *
5080PyUnicode_DecodeUTF8Stateful(const char *s,
5081 Py_ssize_t size,
5082 const char *errors,
5083 Py_ssize_t *consumed)
5084{
5085 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5086}
5087
5088
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005089/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5090 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005091
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005092 On success, write a pointer to a newly allocated wide character string into
5093 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5094 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005095
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005096 On memory allocation failure, return -1.
5097
5098 On decoding error (if surrogateescape is zero), return -2. If wlen is
5099 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5100 is not NULL, write the decoding error message into *reason. */
5101int
5102_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005103 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005104{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005105 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005106 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005107 wchar_t *unicode;
5108 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005109
Victor Stinner3d4226a2018-08-29 22:21:32 +02005110 int surrogateescape = 0;
5111 int surrogatepass = 0;
5112 switch (errors)
5113 {
5114 case _Py_ERROR_STRICT:
5115 break;
5116 case _Py_ERROR_SURROGATEESCAPE:
5117 surrogateescape = 1;
5118 break;
5119 case _Py_ERROR_SURROGATEPASS:
5120 surrogatepass = 1;
5121 break;
5122 default:
5123 return -3;
5124 }
5125
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005126 /* Note: size will always be longer than the resulting Unicode
5127 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005128 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005129 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005130 }
5131
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005132 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005133 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005134 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005135 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005136
5137 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005138 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005139 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005140 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005141 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005142#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005143 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005145 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 if (ch > 0xFF) {
5148#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005149 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005150#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005151 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005152 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5154 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5155#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005156 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005157 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005158 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005159 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005161
5162 if (surrogateescape) {
5163 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5164 }
5165 else {
5166 /* Is it a valid three-byte code? */
5167 if (surrogatepass
5168 && (e - s) >= 3
5169 && (s[0] & 0xf0) == 0xe0
5170 && (s[1] & 0xc0) == 0x80
5171 && (s[2] & 0xc0) == 0x80)
5172 {
5173 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5174 s += 3;
5175 unicode[outpos++] = ch;
5176 }
5177 else {
5178 PyMem_RawFree(unicode );
5179 if (reason != NULL) {
5180 switch (ch) {
5181 case 0:
5182 *reason = "unexpected end of data";
5183 break;
5184 case 1:
5185 *reason = "invalid start byte";
5186 break;
5187 /* 2, 3, 4 */
5188 default:
5189 *reason = "invalid continuation byte";
5190 break;
5191 }
5192 }
5193 if (wlen != NULL) {
5194 *wlen = s - orig_s;
5195 }
5196 return -2;
5197 }
5198 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005199 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005200 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005201 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005202 if (wlen) {
5203 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005204 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005205 *wstr = unicode;
5206 return 0;
5207}
5208
Victor Stinner5f9cf232019-03-19 01:46:25 +01005209
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005210wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005211_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5212 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005213{
5214 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005215 int res = _Py_DecodeUTF8Ex(arg, arglen,
5216 &wstr, wlen,
5217 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005218 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005219 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5220 assert(res != -3);
5221 if (wlen) {
5222 *wlen = (size_t)res;
5223 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005224 return NULL;
5225 }
5226 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005227}
5228
Antoine Pitrouab868312009-01-10 15:40:25 +00005229
Victor Stinnere47e6982017-12-21 15:45:16 +01005230/* UTF-8 encoder using the surrogateescape error handler .
5231
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005232 On success, return 0 and write the newly allocated character string (use
5233 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005234
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005235 On encoding failure, return -2 and write the position of the invalid
5236 surrogate character into *error_pos (if error_pos is set) and the decoding
5237 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005238
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005239 On memory allocation failure, return -1. */
5240int
5241_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005242 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005243{
5244 const Py_ssize_t max_char_size = 4;
5245 Py_ssize_t len = wcslen(text);
5246
5247 assert(len >= 0);
5248
Victor Stinner3d4226a2018-08-29 22:21:32 +02005249 int surrogateescape = 0;
5250 int surrogatepass = 0;
5251 switch (errors)
5252 {
5253 case _Py_ERROR_STRICT:
5254 break;
5255 case _Py_ERROR_SURROGATEESCAPE:
5256 surrogateescape = 1;
5257 break;
5258 case _Py_ERROR_SURROGATEPASS:
5259 surrogatepass = 1;
5260 break;
5261 default:
5262 return -3;
5263 }
5264
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005265 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5266 return -1;
5267 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005268 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005269 if (raw_malloc) {
5270 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005271 }
5272 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005273 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005274 }
5275 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005276 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005277 }
5278
5279 char *p = bytes;
5280 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005281 for (i = 0; i < len; ) {
5282 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005283 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005284 i++;
5285#if Py_UNICODE_SIZE == 2
5286 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5287 && i < len
5288 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5289 {
5290 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5291 i++;
5292 }
5293#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005294
5295 if (ch < 0x80) {
5296 /* Encode ASCII */
5297 *p++ = (char) ch;
5298
5299 }
5300 else if (ch < 0x0800) {
5301 /* Encode Latin-1 */
5302 *p++ = (char)(0xc0 | (ch >> 6));
5303 *p++ = (char)(0x80 | (ch & 0x3f));
5304 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005305 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005306 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005307 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005308 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005309 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005310 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005311 if (reason != NULL) {
5312 *reason = "encoding error";
5313 }
5314 if (raw_malloc) {
5315 PyMem_RawFree(bytes);
5316 }
5317 else {
5318 PyMem_Free(bytes);
5319 }
5320 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005321 }
5322 *p++ = (char)(ch & 0xff);
5323 }
5324 else if (ch < 0x10000) {
5325 *p++ = (char)(0xe0 | (ch >> 12));
5326 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5327 *p++ = (char)(0x80 | (ch & 0x3f));
5328 }
5329 else { /* ch >= 0x10000 */
5330 assert(ch <= MAX_UNICODE);
5331 /* Encode UCS4 Unicode ordinals */
5332 *p++ = (char)(0xf0 | (ch >> 18));
5333 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5334 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5335 *p++ = (char)(0x80 | (ch & 0x3f));
5336 }
5337 }
5338 *p++ = '\0';
5339
5340 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005341 char *bytes2;
5342 if (raw_malloc) {
5343 bytes2 = PyMem_RawRealloc(bytes, final_size);
5344 }
5345 else {
5346 bytes2 = PyMem_Realloc(bytes, final_size);
5347 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005348 if (bytes2 == NULL) {
5349 if (error_pos != NULL) {
5350 *error_pos = (size_t)-1;
5351 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005352 if (raw_malloc) {
5353 PyMem_RawFree(bytes);
5354 }
5355 else {
5356 PyMem_Free(bytes);
5357 }
5358 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005359 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005360 *str = bytes2;
5361 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005362}
5363
5364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005365/* Primary internal function which creates utf8 encoded bytes objects.
5366
5367 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005368 and allocate exactly as much space needed at the end. Else allocate the
5369 maximum possible needed (4 result bytes per Unicode character), and return
5370 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005371*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005372static PyObject *
5373unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5374 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005376 if (!PyUnicode_Check(unicode)) {
5377 PyErr_BadArgument();
5378 return NULL;
5379 }
5380
5381 if (PyUnicode_READY(unicode) == -1)
5382 return NULL;
5383
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005384 if (PyUnicode_UTF8(unicode))
5385 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5386 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005387
Inada Naoki02a4d572020-02-27 13:48:59 +09005388 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005389 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005390 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5391
5392 _PyBytesWriter writer;
5393 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005394
Benjamin Petersonead6b532011-12-20 17:23:42 -06005395 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005396 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005397 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005398 case PyUnicode_1BYTE_KIND:
5399 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5400 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005401 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5402 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005403 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005404 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5405 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005406 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005407 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5408 break;
Tim Peters602f7402002-04-27 18:03:26 +00005409 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005410
5411 if (end == NULL) {
5412 _PyBytesWriter_Dealloc(&writer);
5413 return NULL;
5414 }
5415 return _PyBytesWriter_Finish(&writer, end);
5416}
5417
5418static int
5419unicode_fill_utf8(PyObject *unicode)
5420{
5421 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5422 assert(!PyUnicode_IS_ASCII(unicode));
5423
5424 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005425 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005426 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5427
5428 _PyBytesWriter writer;
5429 char *end;
5430
5431 switch (kind) {
5432 default:
5433 Py_UNREACHABLE();
5434 case PyUnicode_1BYTE_KIND:
5435 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5436 _Py_ERROR_STRICT, NULL);
5437 break;
5438 case PyUnicode_2BYTE_KIND:
5439 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5440 _Py_ERROR_STRICT, NULL);
5441 break;
5442 case PyUnicode_4BYTE_KIND:
5443 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5444 _Py_ERROR_STRICT, NULL);
5445 break;
5446 }
5447 if (end == NULL) {
5448 _PyBytesWriter_Dealloc(&writer);
5449 return -1;
5450 }
5451
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005452 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005453 PyBytes_AS_STRING(writer.buffer);
5454 Py_ssize_t len = end - start;
5455
5456 char *cache = PyObject_MALLOC(len + 1);
5457 if (cache == NULL) {
5458 _PyBytesWriter_Dealloc(&writer);
5459 PyErr_NoMemory();
5460 return -1;
5461 }
5462 _PyUnicode_UTF8(unicode) = cache;
5463 _PyUnicode_UTF8_LENGTH(unicode) = len;
5464 memcpy(cache, start, len);
5465 cache[len] = '\0';
5466 _PyBytesWriter_Dealloc(&writer);
5467 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468}
5469
Alexander Belopolsky40018472011-02-26 01:02:56 +00005470PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005471_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5472{
5473 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5474}
5475
5476
5477PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5479 Py_ssize_t size,
5480 const char *errors)
5481{
5482 PyObject *v, *unicode;
5483
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005484 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005485 if (unicode == NULL)
5486 return NULL;
5487 v = _PyUnicode_AsUTF8String(unicode, errors);
5488 Py_DECREF(unicode);
5489 return v;
5490}
5491
5492PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005493PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005495 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496}
5497
Walter Dörwald41980ca2007-08-16 21:55:45 +00005498/* --- UTF-32 Codec ------------------------------------------------------- */
5499
5500PyObject *
5501PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 Py_ssize_t size,
5503 const char *errors,
5504 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005505{
5506 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5507}
5508
5509PyObject *
5510PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 Py_ssize_t size,
5512 const char *errors,
5513 int *byteorder,
5514 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005515{
5516 const char *starts = s;
5517 Py_ssize_t startinpos;
5518 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005519 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005520 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005521 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005522 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005523 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005524 PyObject *errorHandler = NULL;
5525 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005526
Andy Lestere6be9b52020-02-11 20:28:35 -06005527 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005528 e = q + size;
5529
5530 if (byteorder)
5531 bo = *byteorder;
5532
5533 /* Check for BOM marks (U+FEFF) in the input and adjust current
5534 byte order setting accordingly. In native mode, the leading BOM
5535 mark is skipped, in all other modes, it is copied to the output
5536 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005537 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005538 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005539 if (bom == 0x0000FEFF) {
5540 bo = -1;
5541 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005543 else if (bom == 0xFFFE0000) {
5544 bo = 1;
5545 q += 4;
5546 }
5547 if (byteorder)
5548 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005549 }
5550
Victor Stinnere64322e2012-10-30 23:12:47 +01005551 if (q == e) {
5552 if (consumed)
5553 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005554 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005555 }
5556
Victor Stinnere64322e2012-10-30 23:12:47 +01005557#ifdef WORDS_BIGENDIAN
5558 le = bo < 0;
5559#else
5560 le = bo <= 0;
5561#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005562 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005563
Victor Stinner8f674cc2013-04-17 23:02:17 +02005564 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005565 writer.min_length = (e - q + 3) / 4;
5566 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005567 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005568
Victor Stinnere64322e2012-10-30 23:12:47 +01005569 while (1) {
5570 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005571 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005572
Victor Stinnere64322e2012-10-30 23:12:47 +01005573 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005574 enum PyUnicode_Kind kind = writer.kind;
5575 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005576 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005577 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005578 if (le) {
5579 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005580 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005581 if (ch > maxch)
5582 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005583 if (kind != PyUnicode_1BYTE_KIND &&
5584 Py_UNICODE_IS_SURROGATE(ch))
5585 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005586 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005587 q += 4;
5588 } while (q <= last);
5589 }
5590 else {
5591 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005592 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005593 if (ch > maxch)
5594 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005595 if (kind != PyUnicode_1BYTE_KIND &&
5596 Py_UNICODE_IS_SURROGATE(ch))
5597 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005598 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005599 q += 4;
5600 } while (q <= last);
5601 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005602 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005603 }
5604
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005605 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005606 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005607 startinpos = ((const char *)q) - starts;
5608 endinpos = startinpos + 4;
5609 }
5610 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005611 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005612 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005613 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005614 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005615 startinpos = ((const char *)q) - starts;
5616 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005618 else {
5619 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005620 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005621 goto onError;
5622 q += 4;
5623 continue;
5624 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005625 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005626 startinpos = ((const char *)q) - starts;
5627 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005629
5630 /* The remaining input chars are ignored if the callback
5631 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005632 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005634 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005636 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005638 }
5639
Walter Dörwald41980ca2007-08-16 21:55:45 +00005640 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005642
Walter Dörwald41980ca2007-08-16 21:55:45 +00005643 Py_XDECREF(errorHandler);
5644 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005645 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005646
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005648 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005649 Py_XDECREF(errorHandler);
5650 Py_XDECREF(exc);
5651 return NULL;
5652}
5653
5654PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005655_PyUnicode_EncodeUTF32(PyObject *str,
5656 const char *errors,
5657 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005658{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005659 enum PyUnicode_Kind kind;
5660 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005661 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005662 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005663 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005664#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005665 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005666#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005667 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005668#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005669 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005670 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005671 PyObject *errorHandler = NULL;
5672 PyObject *exc = NULL;
5673 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005674
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005675 if (!PyUnicode_Check(str)) {
5676 PyErr_BadArgument();
5677 return NULL;
5678 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005679 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005680 return NULL;
5681 kind = PyUnicode_KIND(str);
5682 data = PyUnicode_DATA(str);
5683 len = PyUnicode_GET_LENGTH(str);
5684
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005685 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005686 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005687 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005688 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005689 if (v == NULL)
5690 return NULL;
5691
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005692 /* output buffer is 4-bytes aligned */
5693 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005694 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005695 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005696 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005697 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005698 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005699
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005700 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005701 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005702 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005703 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005704 else
5705 encoding = "utf-32";
5706
5707 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005708 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5709 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005710 }
5711
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005712 pos = 0;
5713 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005714 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005715
5716 if (kind == PyUnicode_2BYTE_KIND) {
5717 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5718 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005719 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005720 else {
5721 assert(kind == PyUnicode_4BYTE_KIND);
5722 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5723 &out, native_ordering);
5724 }
5725 if (pos == len)
5726 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005727
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005728 rep = unicode_encode_call_errorhandler(
5729 errors, &errorHandler,
5730 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005731 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005732 if (!rep)
5733 goto error;
5734
5735 if (PyBytes_Check(rep)) {
5736 repsize = PyBytes_GET_SIZE(rep);
5737 if (repsize & 3) {
5738 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005739 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005740 "surrogates not allowed");
5741 goto error;
5742 }
5743 moreunits = repsize / 4;
5744 }
5745 else {
5746 assert(PyUnicode_Check(rep));
5747 if (PyUnicode_READY(rep) < 0)
5748 goto error;
5749 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5750 if (!PyUnicode_IS_ASCII(rep)) {
5751 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005752 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 "surrogates not allowed");
5754 goto error;
5755 }
5756 }
5757
5758 /* four bytes are reserved for each surrogate */
5759 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005760 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005761 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005762 /* integer overflow */
5763 PyErr_NoMemory();
5764 goto error;
5765 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005766 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005768 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 }
5770
5771 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005772 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005773 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005774 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005775 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005776 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5777 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005778 }
5779
5780 Py_CLEAR(rep);
5781 }
5782
5783 /* Cut back to size actually needed. This is necessary for, for example,
5784 encoding of a string containing isolated surrogates and the 'ignore'
5785 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005786 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005787 if (nsize != PyBytes_GET_SIZE(v))
5788 _PyBytes_Resize(&v, nsize);
5789 Py_XDECREF(errorHandler);
5790 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005791 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005792 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005793 error:
5794 Py_XDECREF(rep);
5795 Py_XDECREF(errorHandler);
5796 Py_XDECREF(exc);
5797 Py_XDECREF(v);
5798 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005799}
5800
Alexander Belopolsky40018472011-02-26 01:02:56 +00005801PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005802PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5803 Py_ssize_t size,
5804 const char *errors,
5805 int byteorder)
5806{
5807 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005808 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005809 if (tmp == NULL)
5810 return NULL;
5811 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5812 Py_DECREF(tmp);
5813 return result;
5814}
5815
5816PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005817PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005818{
Victor Stinnerb960b342011-11-20 19:12:52 +01005819 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005820}
5821
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822/* --- UTF-16 Codec ------------------------------------------------------- */
5823
Tim Peters772747b2001-08-09 22:21:55 +00005824PyObject *
5825PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 Py_ssize_t size,
5827 const char *errors,
5828 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829{
Walter Dörwald69652032004-09-07 20:24:22 +00005830 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5831}
5832
5833PyObject *
5834PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 Py_ssize_t size,
5836 const char *errors,
5837 int *byteorder,
5838 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005839{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005840 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005841 Py_ssize_t startinpos;
5842 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005843 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005844 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005845 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005846 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005847 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 PyObject *errorHandler = NULL;
5849 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005850 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851
Andy Lestere6be9b52020-02-11 20:28:35 -06005852 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005853 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854
5855 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005856 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005858 /* Check for BOM marks (U+FEFF) in the input and adjust current
5859 byte order setting accordingly. In native mode, the leading BOM
5860 mark is skipped, in all other modes, it is copied to the output
5861 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005862 if (bo == 0 && size >= 2) {
5863 const Py_UCS4 bom = (q[1] << 8) | q[0];
5864 if (bom == 0xFEFF) {
5865 q += 2;
5866 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005868 else if (bom == 0xFFFE) {
5869 q += 2;
5870 bo = 1;
5871 }
5872 if (byteorder)
5873 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005874 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
Antoine Pitrou63065d72012-05-15 23:48:04 +02005876 if (q == e) {
5877 if (consumed)
5878 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005879 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005880 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005881
Christian Heimes743e0cd2012-10-17 23:52:17 +02005882#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005883 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005884 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005885#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005886 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005887 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005888#endif
Tim Peters772747b2001-08-09 22:21:55 +00005889
Antoine Pitrou63065d72012-05-15 23:48:04 +02005890 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005891 character count normally. Error handler will take care of
5892 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005893 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005894 writer.min_length = (e - q + 1) / 2;
5895 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005896 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005897
Antoine Pitrou63065d72012-05-15 23:48:04 +02005898 while (1) {
5899 Py_UCS4 ch = 0;
5900 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005901 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005902 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005903 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005904 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005906 native_ordering);
5907 else
5908 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005909 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005910 native_ordering);
5911 } else if (kind == PyUnicode_2BYTE_KIND) {
5912 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005913 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005914 native_ordering);
5915 } else {
5916 assert(kind == PyUnicode_4BYTE_KIND);
5917 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005918 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005919 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005920 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005921 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005922
Antoine Pitrou63065d72012-05-15 23:48:04 +02005923 switch (ch)
5924 {
5925 case 0:
5926 /* remaining byte at the end? (size should be even) */
5927 if (q == e || consumed)
5928 goto End;
5929 errmsg = "truncated data";
5930 startinpos = ((const char *)q) - starts;
5931 endinpos = ((const char *)e) - starts;
5932 break;
5933 /* The remaining input chars are ignored if the callback
5934 chooses to skip the input */
5935 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005936 q -= 2;
5937 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005938 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005939 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005940 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005941 endinpos = ((const char *)e) - starts;
5942 break;
5943 case 2:
5944 errmsg = "illegal encoding";
5945 startinpos = ((const char *)q) - 2 - starts;
5946 endinpos = startinpos + 2;
5947 break;
5948 case 3:
5949 errmsg = "illegal UTF-16 surrogate";
5950 startinpos = ((const char *)q) - 4 - starts;
5951 endinpos = startinpos + 2;
5952 break;
5953 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005954 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005955 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 continue;
5957 }
5958
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005959 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005960 errors,
5961 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005962 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005963 &starts,
5964 (const char **)&e,
5965 &startinpos,
5966 &endinpos,
5967 &exc,
5968 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005969 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 }
5972
Antoine Pitrou63065d72012-05-15 23:48:04 +02005973End:
Walter Dörwald69652032004-09-07 20:24:22 +00005974 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005975 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005976
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005977 Py_XDECREF(errorHandler);
5978 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005979 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005982 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005983 Py_XDECREF(errorHandler);
5984 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 return NULL;
5986}
5987
Tim Peters772747b2001-08-09 22:21:55 +00005988PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005989_PyUnicode_EncodeUTF16(PyObject *str,
5990 const char *errors,
5991 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005993 enum PyUnicode_Kind kind;
5994 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005995 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005996 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005997 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005998 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005999#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006000 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006001#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006002 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006003#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006004 const char *encoding;
6005 Py_ssize_t nsize, pos;
6006 PyObject *errorHandler = NULL;
6007 PyObject *exc = NULL;
6008 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006009
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006010 if (!PyUnicode_Check(str)) {
6011 PyErr_BadArgument();
6012 return NULL;
6013 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006014 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006015 return NULL;
6016 kind = PyUnicode_KIND(str);
6017 data = PyUnicode_DATA(str);
6018 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006019
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006020 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006021 if (kind == PyUnicode_4BYTE_KIND) {
6022 const Py_UCS4 *in = (const Py_UCS4 *)data;
6023 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006024 while (in < end) {
6025 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006026 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006027 }
6028 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006029 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006030 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006032 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006033 nsize = len + pairs + (byteorder == 0);
6034 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006035 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006039 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006040 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006041 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006042 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006043 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006044 }
6045 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006046 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006047 }
Tim Peters772747b2001-08-09 22:21:55 +00006048
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006049 if (kind == PyUnicode_1BYTE_KIND) {
6050 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6051 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006052 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006053
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006054 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006055 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006056 }
6057 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006058 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006059 }
6060 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006061 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006062 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006063
6064 pos = 0;
6065 while (pos < len) {
6066 Py_ssize_t repsize, moreunits;
6067
6068 if (kind == PyUnicode_2BYTE_KIND) {
6069 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6070 &out, native_ordering);
6071 }
6072 else {
6073 assert(kind == PyUnicode_4BYTE_KIND);
6074 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6075 &out, native_ordering);
6076 }
6077 if (pos == len)
6078 break;
6079
6080 rep = unicode_encode_call_errorhandler(
6081 errors, &errorHandler,
6082 encoding, "surrogates not allowed",
6083 str, &exc, pos, pos + 1, &pos);
6084 if (!rep)
6085 goto error;
6086
6087 if (PyBytes_Check(rep)) {
6088 repsize = PyBytes_GET_SIZE(rep);
6089 if (repsize & 1) {
6090 raise_encode_exception(&exc, encoding,
6091 str, pos - 1, pos,
6092 "surrogates not allowed");
6093 goto error;
6094 }
6095 moreunits = repsize / 2;
6096 }
6097 else {
6098 assert(PyUnicode_Check(rep));
6099 if (PyUnicode_READY(rep) < 0)
6100 goto error;
6101 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6102 if (!PyUnicode_IS_ASCII(rep)) {
6103 raise_encode_exception(&exc, encoding,
6104 str, pos - 1, pos,
6105 "surrogates not allowed");
6106 goto error;
6107 }
6108 }
6109
6110 /* two bytes are reserved for each surrogate */
6111 if (moreunits > 1) {
6112 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006113 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006114 /* integer overflow */
6115 PyErr_NoMemory();
6116 goto error;
6117 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006118 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006119 goto error;
6120 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6121 }
6122
6123 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006124 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006125 out += moreunits;
6126 } else /* rep is unicode */ {
6127 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6128 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6129 &out, native_ordering);
6130 }
6131
6132 Py_CLEAR(rep);
6133 }
6134
6135 /* Cut back to size actually needed. This is necessary for, for example,
6136 encoding of a string containing isolated surrogates and the 'ignore' handler
6137 is used. */
6138 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6139 if (nsize != PyBytes_GET_SIZE(v))
6140 _PyBytes_Resize(&v, nsize);
6141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006143 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006144 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006145 error:
6146 Py_XDECREF(rep);
6147 Py_XDECREF(errorHandler);
6148 Py_XDECREF(exc);
6149 Py_XDECREF(v);
6150 return NULL;
6151#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152}
6153
Alexander Belopolsky40018472011-02-26 01:02:56 +00006154PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6156 Py_ssize_t size,
6157 const char *errors,
6158 int byteorder)
6159{
6160 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006161 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 if (tmp == NULL)
6163 return NULL;
6164 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6165 Py_DECREF(tmp);
6166 return result;
6167}
6168
6169PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006170PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173}
6174
6175/* --- Unicode Escape Codec ----------------------------------------------- */
6176
Fredrik Lundh06d12682001-01-24 07:59:11 +00006177static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006178
Alexander Belopolsky40018472011-02-26 01:02:56 +00006179PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006180_PyUnicode_DecodeUnicodeEscape(const char *s,
6181 Py_ssize_t size,
6182 const char *errors,
6183 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006186 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006188 PyObject *errorHandler = NULL;
6189 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006190
Eric V. Smith42454af2016-10-31 09:22:08 -04006191 // so we can remember if we've seen an invalid escape char or not
6192 *first_invalid_escape = NULL;
6193
Victor Stinner62ec3312016-09-06 17:04:34 -07006194 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006195 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006196 }
6197 /* Escaped strings will always be longer than the resulting
6198 Unicode string, so we start with size here and then reduce the
6199 length after conversion to the true value.
6200 (but if the error callback returns a long replacement string
6201 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006202 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006203 writer.min_length = size;
6204 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6205 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006206 }
6207
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 end = s + size;
6209 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006210 unsigned char c = (unsigned char) *s++;
6211 Py_UCS4 ch;
6212 int count;
6213 Py_ssize_t startinpos;
6214 Py_ssize_t endinpos;
6215 const char *message;
6216
6217#define WRITE_ASCII_CHAR(ch) \
6218 do { \
6219 assert(ch <= 127); \
6220 assert(writer.pos < writer.size); \
6221 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6222 } while(0)
6223
6224#define WRITE_CHAR(ch) \
6225 do { \
6226 if (ch <= writer.maxchar) { \
6227 assert(writer.pos < writer.size); \
6228 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6229 } \
6230 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6231 goto onError; \
6232 } \
6233 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234
6235 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 if (c != '\\') {
6237 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 continue;
6239 }
6240
Victor Stinner62ec3312016-09-06 17:04:34 -07006241 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006243 if (s >= end) {
6244 message = "\\ at end of string";
6245 goto error;
6246 }
6247 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006248
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006250 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006253 case '\n': continue;
6254 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6255 case '\'': WRITE_ASCII_CHAR('\''); continue;
6256 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6257 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006258 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006259 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6260 case 't': WRITE_ASCII_CHAR('\t'); continue;
6261 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6262 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006263 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006264 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006265 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006266 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 case '0': case '1': case '2': case '3':
6270 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006271 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006272 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006273 ch = (ch<<3) + *s++ - '0';
6274 if (s < end && '0' <= *s && *s <= '7') {
6275 ch = (ch<<3) + *s++ - '0';
6276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006278 WRITE_CHAR(ch);
6279 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 /* hex escapes */
6282 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006284 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006285 message = "truncated \\xXX escape";
6286 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006290 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006291 message = "truncated \\uXXXX escape";
6292 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006295 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006297 message = "truncated \\UXXXXXXXX escape";
6298 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006299 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006300 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 ch <<= 4;
6302 if (c >= '0' && c <= '9') {
6303 ch += c - '0';
6304 }
6305 else if (c >= 'a' && c <= 'f') {
6306 ch += c - ('a' - 10);
6307 }
6308 else if (c >= 'A' && c <= 'F') {
6309 ch += c - ('A' - 10);
6310 }
6311 else {
6312 break;
6313 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006314 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006316 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 }
6318
6319 /* when we get here, ch is a 32-bit unicode character */
6320 if (ch > MAX_UNICODE) {
6321 message = "illegal Unicode character";
6322 goto error;
6323 }
6324
6325 WRITE_CHAR(ch);
6326 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006327
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006329 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006330 if (ucnhash_CAPI == NULL) {
6331 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006332 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6333 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 if (ucnhash_CAPI == NULL) {
6335 PyErr_SetString(
6336 PyExc_UnicodeError,
6337 "\\N escapes not supported (can't load unicodedata module)"
6338 );
6339 goto onError;
6340 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006341 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006342
6343 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006344 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 const char *start = ++s;
6346 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006347 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006348 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006349 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 namelen = s - start;
6351 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006352 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006353 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006354 ch = 0xffffffff; /* in case 'getcode' messes up */
6355 if (namelen <= INT_MAX &&
6356 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6357 &ch, 0)) {
6358 assert(ch <= MAX_UNICODE);
6359 WRITE_CHAR(ch);
6360 continue;
6361 }
6362 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006363 }
6364 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006365 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006366
6367 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006368 if (*first_invalid_escape == NULL) {
6369 *first_invalid_escape = s-1; /* Back up one char, since we've
6370 already incremented s. */
6371 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006372 WRITE_ASCII_CHAR('\\');
6373 WRITE_CHAR(c);
6374 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006376
6377 error:
6378 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006380 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006381 errors, &errorHandler,
6382 "unicodeescape", message,
6383 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006384 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006385 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006386 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006387 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006388
6389#undef WRITE_ASCII_CHAR
6390#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006392
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006393 Py_XDECREF(errorHandler);
6394 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006395 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006396
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006398 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 Py_XDECREF(errorHandler);
6400 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 return NULL;
6402}
6403
Eric V. Smith42454af2016-10-31 09:22:08 -04006404PyObject *
6405PyUnicode_DecodeUnicodeEscape(const char *s,
6406 Py_ssize_t size,
6407 const char *errors)
6408{
6409 const char *first_invalid_escape;
6410 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6411 &first_invalid_escape);
6412 if (result == NULL)
6413 return NULL;
6414 if (first_invalid_escape != NULL) {
6415 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6416 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006417 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006418 Py_DECREF(result);
6419 return NULL;
6420 }
6421 }
6422 return result;
6423}
6424
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006425/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
Alexander Belopolsky40018472011-02-26 01:02:56 +00006427PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006430 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006434 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436
Ezio Melottie7f90372012-10-05 03:33:31 +03006437 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006438 escape.
6439
Ezio Melottie7f90372012-10-05 03:33:31 +03006440 For UCS1 strings it's '\xxx', 4 bytes per source character.
6441 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6442 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006443 */
6444
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006445 if (!PyUnicode_Check(unicode)) {
6446 PyErr_BadArgument();
6447 return NULL;
6448 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006449 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006450 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 }
Victor Stinner358af132015-10-12 22:36:57 +02006452
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006453 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006454 if (len == 0) {
6455 return PyBytes_FromStringAndSize(NULL, 0);
6456 }
6457
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006458 kind = PyUnicode_KIND(unicode);
6459 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006460 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6461 bytes, and 1 byte characters 4. */
6462 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006463 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 return PyErr_NoMemory();
6465 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006466 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 if (repr == NULL) {
6468 return NULL;
6469 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006470
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006472 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006473 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006474
Victor Stinner62ec3312016-09-06 17:04:34 -07006475 /* U+0000-U+00ff range */
6476 if (ch < 0x100) {
6477 if (ch >= ' ' && ch < 127) {
6478 if (ch != '\\') {
6479 /* Copy printable US ASCII as-is */
6480 *p++ = (char) ch;
6481 }
6482 /* Escape backslashes */
6483 else {
6484 *p++ = '\\';
6485 *p++ = '\\';
6486 }
6487 }
Victor Stinner358af132015-10-12 22:36:57 +02006488
Victor Stinner62ec3312016-09-06 17:04:34 -07006489 /* Map special whitespace to '\t', \n', '\r' */
6490 else if (ch == '\t') {
6491 *p++ = '\\';
6492 *p++ = 't';
6493 }
6494 else if (ch == '\n') {
6495 *p++ = '\\';
6496 *p++ = 'n';
6497 }
6498 else if (ch == '\r') {
6499 *p++ = '\\';
6500 *p++ = 'r';
6501 }
6502
6503 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6504 else {
6505 *p++ = '\\';
6506 *p++ = 'x';
6507 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6508 *p++ = Py_hexdigits[ch & 0x000F];
6509 }
Tim Petersced69f82003-09-16 20:30:58 +00006510 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006511 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006512 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 *p++ = '\\';
6514 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006515 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6516 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6517 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6518 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006520 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6521 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006522
Victor Stinner62ec3312016-09-06 17:04:34 -07006523 /* Make sure that the first two digits are zero */
6524 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006525 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006526 *p++ = 'U';
6527 *p++ = '0';
6528 *p++ = '0';
6529 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6530 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6531 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6532 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6533 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6534 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537
Victor Stinner62ec3312016-09-06 17:04:34 -07006538 assert(p - PyBytes_AS_STRING(repr) > 0);
6539 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6540 return NULL;
6541 }
6542 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543}
6544
Alexander Belopolsky40018472011-02-26 01:02:56 +00006545PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006546PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6547 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006549 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006550 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006551 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006553 }
6554
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006555 result = PyUnicode_AsUnicodeEscapeString(tmp);
6556 Py_DECREF(tmp);
6557 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558}
6559
6560/* --- Raw Unicode Escape Codec ------------------------------------------- */
6561
Alexander Belopolsky40018472011-02-26 01:02:56 +00006562PyObject *
6563PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006564 Py_ssize_t size,
6565 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006567 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006568 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006570 PyObject *errorHandler = NULL;
6571 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006572
Victor Stinner62ec3312016-09-06 17:04:34 -07006573 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006574 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006575 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006576
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 /* Escaped strings will always be longer than the resulting
6578 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579 length after conversion to the true value. (But decoding error
6580 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006581 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006582 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006583 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6584 goto onError;
6585 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006586
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 end = s + size;
6588 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006589 unsigned char c = (unsigned char) *s++;
6590 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006591 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006592 Py_ssize_t startinpos;
6593 Py_ssize_t endinpos;
6594 const char *message;
6595
6596#define WRITE_CHAR(ch) \
6597 do { \
6598 if (ch <= writer.maxchar) { \
6599 assert(writer.pos < writer.size); \
6600 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6601 } \
6602 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6603 goto onError; \
6604 } \
6605 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006608 if (c != '\\' || s >= end) {
6609 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006611 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006612
Victor Stinner62ec3312016-09-06 17:04:34 -07006613 c = (unsigned char) *s++;
6614 if (c == 'u') {
6615 count = 4;
6616 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006618 else if (c == 'U') {
6619 count = 8;
6620 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006621 }
6622 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006623 assert(writer.pos < writer.size);
6624 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6625 WRITE_CHAR(c);
6626 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006627 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006628 startinpos = s - starts - 2;
6629
6630 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6631 for (ch = 0; count && s < end; ++s, --count) {
6632 c = (unsigned char)*s;
6633 ch <<= 4;
6634 if (c >= '0' && c <= '9') {
6635 ch += c - '0';
6636 }
6637 else if (c >= 'a' && c <= 'f') {
6638 ch += c - ('a' - 10);
6639 }
6640 else if (c >= 'A' && c <= 'F') {
6641 ch += c - ('A' - 10);
6642 }
6643 else {
6644 break;
6645 }
6646 }
6647 if (!count) {
6648 if (ch <= MAX_UNICODE) {
6649 WRITE_CHAR(ch);
6650 continue;
6651 }
6652 message = "\\Uxxxxxxxx out of range";
6653 }
6654
6655 endinpos = s-starts;
6656 writer.min_length = end - s + writer.pos;
6657 if (unicode_decode_call_errorhandler_writer(
6658 errors, &errorHandler,
6659 "rawunicodeescape", message,
6660 &starts, &end, &startinpos, &endinpos, &exc, &s,
6661 &writer)) {
6662 goto onError;
6663 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006664 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006665
6666#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 Py_XDECREF(errorHandler);
6669 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006670 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006671
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006673 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 Py_XDECREF(errorHandler);
6675 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006677
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678}
6679
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006680
Alexander Belopolsky40018472011-02-26 01:02:56 +00006681PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006682PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683{
Victor Stinner62ec3312016-09-06 17:04:34 -07006684 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006686 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006687 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006688 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006689 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006691 if (!PyUnicode_Check(unicode)) {
6692 PyErr_BadArgument();
6693 return NULL;
6694 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006695 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006696 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006697 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006698 kind = PyUnicode_KIND(unicode);
6699 data = PyUnicode_DATA(unicode);
6700 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006701 if (kind == PyUnicode_1BYTE_KIND) {
6702 return PyBytes_FromStringAndSize(data, len);
6703 }
Victor Stinner0e368262011-11-10 20:12:49 +01006704
Victor Stinner62ec3312016-09-06 17:04:34 -07006705 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6706 bytes, and 1 byte characters 4. */
6707 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006708
Victor Stinner62ec3312016-09-06 17:04:34 -07006709 if (len > PY_SSIZE_T_MAX / expandsize) {
6710 return PyErr_NoMemory();
6711 }
6712 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6713 if (repr == NULL) {
6714 return NULL;
6715 }
6716 if (len == 0) {
6717 return repr;
6718 }
6719
6720 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006721 for (pos = 0; pos < len; pos++) {
6722 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006723
Victor Stinner62ec3312016-09-06 17:04:34 -07006724 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6725 if (ch < 0x100) {
6726 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006727 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006728 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006729 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 *p++ = '\\';
6731 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006732 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6733 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6734 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6735 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006737 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6738 else {
6739 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6740 *p++ = '\\';
6741 *p++ = 'U';
6742 *p++ = '0';
6743 *p++ = '0';
6744 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6745 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6746 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6747 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6748 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6749 *p++ = Py_hexdigits[ch & 15];
6750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006752
Victor Stinner62ec3312016-09-06 17:04:34 -07006753 assert(p > PyBytes_AS_STRING(repr));
6754 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6755 return NULL;
6756 }
6757 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758}
6759
Alexander Belopolsky40018472011-02-26 01:02:56 +00006760PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006761PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6762 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006764 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006765 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006766 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006767 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006768 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6769 Py_DECREF(tmp);
6770 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771}
6772
6773/* --- Latin-1 Codec ------------------------------------------------------ */
6774
Alexander Belopolsky40018472011-02-26 01:02:56 +00006775PyObject *
6776PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006777 Py_ssize_t size,
6778 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006781 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782}
6783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006785static void
6786make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006787 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006788 PyObject *unicode,
6789 Py_ssize_t startpos, Py_ssize_t endpos,
6790 const char *reason)
6791{
6792 if (*exceptionObject == NULL) {
6793 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006795 encoding, unicode, startpos, endpos, reason);
6796 }
6797 else {
6798 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6799 goto onError;
6800 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6801 goto onError;
6802 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6803 goto onError;
6804 return;
6805 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006806 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006807 }
6808}
6809
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006811static void
6812raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006813 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006814 PyObject *unicode,
6815 Py_ssize_t startpos, Py_ssize_t endpos,
6816 const char *reason)
6817{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006818 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006819 encoding, unicode, startpos, endpos, reason);
6820 if (*exceptionObject != NULL)
6821 PyCodec_StrictErrors(*exceptionObject);
6822}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006823
6824/* error handling callback helper:
6825 build arguments, call the callback and check the arguments,
6826 put the result into newpos and return the replacement string, which
6827 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006828static PyObject *
6829unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006830 PyObject **errorHandler,
6831 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006832 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006833 Py_ssize_t startpos, Py_ssize_t endpos,
6834 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006835{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006836 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006837 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006838 PyObject *restuple;
6839 PyObject *resunicode;
6840
6841 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006843 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006845 }
6846
Benjamin Petersonbac79492012-01-14 13:34:47 -05006847 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006848 return NULL;
6849 len = PyUnicode_GET_LENGTH(unicode);
6850
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006851 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006852 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006853 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006855
Petr Viktorinffd97532020-02-11 17:46:57 +01006856 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006860 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 Py_DECREF(restuple);
6862 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006863 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006864 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 &resunicode, newpos)) {
6866 Py_DECREF(restuple);
6867 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006869 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6870 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6871 Py_DECREF(restuple);
6872 return NULL;
6873 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006875 *newpos = len + *newpos;
6876 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006877 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 Py_DECREF(restuple);
6879 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006880 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881 Py_INCREF(resunicode);
6882 Py_DECREF(restuple);
6883 return resunicode;
6884}
6885
Alexander Belopolsky40018472011-02-26 01:02:56 +00006886static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006888 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006889 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006891 /* input state */
6892 Py_ssize_t pos=0, size;
6893 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006894 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006895 /* pointer into the output */
6896 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006897 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6898 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006899 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006900 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006901 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006902 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006903 /* output object */
6904 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905
Benjamin Petersonbac79492012-01-14 13:34:47 -05006906 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006907 return NULL;
6908 size = PyUnicode_GET_LENGTH(unicode);
6909 kind = PyUnicode_KIND(unicode);
6910 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006911 /* allocate enough for a simple encoding without
6912 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006913 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006914 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006915
6916 _PyBytesWriter_Init(&writer);
6917 str = _PyBytesWriter_Alloc(&writer, size);
6918 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006919 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006921 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006922 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006925 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006927 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006928 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006931 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006933 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006934 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006936
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006937 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006939
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006940 /* Only overallocate the buffer if it's not the last write */
6941 writer.overallocate = (collend < size);
6942
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006944 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006945 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006946
6947 switch (error_handler) {
6948 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006949 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006951
6952 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006953 memset(str, '?', collend - collstart);
6954 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006955 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006956 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006957 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 break;
Victor Stinner50149202015-09-22 00:26:54 +02006959
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006960 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006961 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006962 writer.min_size -= (collend - collstart);
6963 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006964 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006965 if (str == NULL)
6966 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006967 pos = collend;
6968 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006969
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006970 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006971 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006972 writer.min_size -= (collend - collstart);
6973 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006974 unicode, collstart, collend);
6975 if (str == NULL)
6976 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006977 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 break;
Victor Stinner50149202015-09-22 00:26:54 +02006979
Victor Stinnerc3713e92015-09-29 12:32:13 +02006980 case _Py_ERROR_SURROGATEESCAPE:
6981 for (i = collstart; i < collend; ++i) {
6982 ch = PyUnicode_READ(kind, data, i);
6983 if (ch < 0xdc80 || 0xdcff < ch) {
6984 /* Not a UTF-8b surrogate */
6985 break;
6986 }
6987 *str++ = (char)(ch - 0xdc00);
6988 ++pos;
6989 }
6990 if (i >= collend)
6991 break;
6992 collstart = pos;
6993 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006994 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006995
Benjamin Peterson29060642009-01-31 22:14:21 +00006996 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006997 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6998 encoding, reason, unicode, &exc,
6999 collstart, collend, &newpos);
7000 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007001 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007002
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007003 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007004 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007005
Victor Stinner6bd525b2015-10-09 13:10:05 +02007006 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007007 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007008 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007009 PyBytes_AS_STRING(rep),
7010 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007011 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007012 else {
7013 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007014
Victor Stinner6bd525b2015-10-09 13:10:05 +02007015 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007017
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007018 if (limit == 256 ?
7019 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7020 !PyUnicode_IS_ASCII(rep))
7021 {
7022 /* Not all characters are smaller than limit */
7023 raise_encode_exception(&exc, encoding, unicode,
7024 collstart, collend, reason);
7025 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007027 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7028 str = _PyBytesWriter_WriteBytes(&writer, str,
7029 PyUnicode_DATA(rep),
7030 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007032 if (str == NULL)
7033 goto onError;
7034
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007035 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007036 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007037 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007038
7039 /* If overallocation was disabled, ensure that it was the last
7040 write. Otherwise, we missed an optimization */
7041 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007042 }
7043 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007044
Victor Stinner50149202015-09-22 00:26:54 +02007045 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007046 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007047 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007048
7049 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007050 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007051 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007052 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007053 Py_XDECREF(exc);
7054 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055}
7056
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007057/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007058PyObject *
7059PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007060 Py_ssize_t size,
7061 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007063 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007064 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007065 if (unicode == NULL)
7066 return NULL;
7067 result = unicode_encode_ucs1(unicode, errors, 256);
7068 Py_DECREF(unicode);
7069 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070}
7071
Alexander Belopolsky40018472011-02-26 01:02:56 +00007072PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007073_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074{
7075 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 PyErr_BadArgument();
7077 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007079 if (PyUnicode_READY(unicode) == -1)
7080 return NULL;
7081 /* Fast path: if it is a one-byte string, construct
7082 bytes object directly. */
7083 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7084 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7085 PyUnicode_GET_LENGTH(unicode));
7086 /* Non-Latin-1 characters present. Defer to above function to
7087 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007088 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007089}
7090
7091PyObject*
7092PyUnicode_AsLatin1String(PyObject *unicode)
7093{
7094 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095}
7096
7097/* --- 7-bit ASCII Codec -------------------------------------------------- */
7098
Alexander Belopolsky40018472011-02-26 01:02:56 +00007099PyObject *
7100PyUnicode_DecodeASCII(const char *s,
7101 Py_ssize_t size,
7102 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007104 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007105 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007106 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007107 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007108 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007109
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007111 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007112
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007114 if (size == 1 && (unsigned char)s[0] < 128)
7115 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007116
Inada Naoki770847a2019-06-24 12:30:24 +09007117 // Shortcut for simple case
7118 PyObject *u = PyUnicode_New(size, 127);
7119 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007120 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007121 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007122 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007123 if (outpos == size) {
7124 return u;
7125 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007126
Inada Naoki770847a2019-06-24 12:30:24 +09007127 _PyUnicodeWriter writer;
7128 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007129 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007130
Inada Naoki770847a2019-06-24 12:30:24 +09007131 s += outpos;
7132 int kind = writer.kind;
7133 void *data = writer.data;
7134 Py_ssize_t startinpos, endinpos;
7135
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007136 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007137 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007139 PyUnicode_WRITE(kind, data, writer.pos, c);
7140 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007142 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007143 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007144
7145 /* byte outsize range 0x00..0x7f: call the error handler */
7146
7147 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007148 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007149
7150 switch (error_handler)
7151 {
7152 case _Py_ERROR_REPLACE:
7153 case _Py_ERROR_SURROGATEESCAPE:
7154 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007155 but we may switch to UCS2 at the first write */
7156 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7157 goto onError;
7158 kind = writer.kind;
7159 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007160
7161 if (error_handler == _Py_ERROR_REPLACE)
7162 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7163 else
7164 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7165 writer.pos++;
7166 ++s;
7167 break;
7168
7169 case _Py_ERROR_IGNORE:
7170 ++s;
7171 break;
7172
7173 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 startinpos = s-starts;
7175 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007176 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007177 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 "ascii", "ordinal not in range(128)",
7179 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007180 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007182 kind = writer.kind;
7183 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007186 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007187 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007188 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007189
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007191 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007192 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007193 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194 return NULL;
7195}
7196
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007197/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007198PyObject *
7199PyUnicode_EncodeASCII(const Py_UNICODE *p,
7200 Py_ssize_t size,
7201 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007203 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007204 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007205 if (unicode == NULL)
7206 return NULL;
7207 result = unicode_encode_ucs1(unicode, errors, 128);
7208 Py_DECREF(unicode);
7209 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210}
7211
Alexander Belopolsky40018472011-02-26 01:02:56 +00007212PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007213_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214{
7215 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 PyErr_BadArgument();
7217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007219 if (PyUnicode_READY(unicode) == -1)
7220 return NULL;
7221 /* Fast path: if it is an ASCII-only string, construct bytes object
7222 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007223 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007224 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7225 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007226 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007227}
7228
7229PyObject *
7230PyUnicode_AsASCIIString(PyObject *unicode)
7231{
7232 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233}
7234
Steve Dowercc16be82016-09-08 10:35:16 -07007235#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007236
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007237/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007238
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007239#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007240#define NEED_RETRY
7241#endif
7242
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7247#define DECODING_CHUNK_SIZE (INT_MAX/4)
7248
Victor Stinner3a50e702011-10-18 21:21:00 +02007249#ifndef WC_ERR_INVALID_CHARS
7250# define WC_ERR_INVALID_CHARS 0x0080
7251#endif
7252
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007253static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007254code_page_name(UINT code_page, PyObject **obj)
7255{
7256 *obj = NULL;
7257 if (code_page == CP_ACP)
7258 return "mbcs";
7259 if (code_page == CP_UTF7)
7260 return "CP_UTF7";
7261 if (code_page == CP_UTF8)
7262 return "CP_UTF8";
7263
7264 *obj = PyBytes_FromFormat("cp%u", code_page);
7265 if (*obj == NULL)
7266 return NULL;
7267 return PyBytes_AS_STRING(*obj);
7268}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007269
Victor Stinner3a50e702011-10-18 21:21:00 +02007270static DWORD
7271decode_code_page_flags(UINT code_page)
7272{
7273 if (code_page == CP_UTF7) {
7274 /* The CP_UTF7 decoder only supports flags=0 */
7275 return 0;
7276 }
7277 else
7278 return MB_ERR_INVALID_CHARS;
7279}
7280
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007281/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 * Decode a byte string from a Windows code page into unicode object in strict
7283 * mode.
7284 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007285 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7286 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007288static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007289decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007290 wchar_t **buf,
7291 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007292 const char *in,
7293 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007294{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007295 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007296 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007297 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007298
7299 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007301 while ((outsize = MultiByteToWideChar(code_page, flags,
7302 in, insize, NULL, 0)) <= 0)
7303 {
7304 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7305 goto error;
7306 }
7307 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7308 flags = 0;
7309 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007310
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007311 /* Extend a wchar_t* buffer */
7312 Py_ssize_t n = *bufsize; /* Get the current length */
7313 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7314 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007316 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317
7318 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007319 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7320 if (outsize <= 0)
7321 goto error;
7322 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007323
Victor Stinner3a50e702011-10-18 21:21:00 +02007324error:
7325 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7326 return -2;
7327 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007328 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007329}
7330
Victor Stinner3a50e702011-10-18 21:21:00 +02007331/*
7332 * Decode a byte string from a code page into unicode object with an error
7333 * handler.
7334 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007335 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007336 * UnicodeDecodeError exception and returns -1 on error.
7337 */
7338static int
7339decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007340 wchar_t **buf,
7341 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007342 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007343 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007344{
7345 const char *startin = in;
7346 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007347 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 /* Ideally, we should get reason from FormatMessage. This is the Windows
7349 2000 English version of the message. */
7350 const char *reason = "No mapping for the Unicode character exists "
7351 "in the target code page.";
7352 /* each step cannot decode more than 1 character, but a character can be
7353 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007354 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007355 int insize;
7356 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 PyObject *errorHandler = NULL;
7358 PyObject *exc = NULL;
7359 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007360 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007361 DWORD err;
7362 int ret = -1;
7363
7364 assert(size > 0);
7365
7366 encoding = code_page_name(code_page, &encoding_obj);
7367 if (encoding == NULL)
7368 return -1;
7369
Victor Stinner7d00cc12014-03-17 23:08:06 +01007370 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7372 UnicodeDecodeError. */
7373 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7374 if (exc != NULL) {
7375 PyCodec_StrictErrors(exc);
7376 Py_CLEAR(exc);
7377 }
7378 goto error;
7379 }
7380
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007381 /* Extend a wchar_t* buffer */
7382 Py_ssize_t n = *bufsize; /* Get the current length */
7383 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7384 PyErr_NoMemory();
7385 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007387 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7388 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007390 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007391
7392 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 while (in < endin)
7394 {
7395 /* Decode a character */
7396 insize = 1;
7397 do
7398 {
7399 outsize = MultiByteToWideChar(code_page, flags,
7400 in, insize,
7401 buffer, Py_ARRAY_LENGTH(buffer));
7402 if (outsize > 0)
7403 break;
7404 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007405 if (err == ERROR_INVALID_FLAGS && flags) {
7406 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7407 flags = 0;
7408 continue;
7409 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 if (err != ERROR_NO_UNICODE_TRANSLATION
7411 && err != ERROR_INSUFFICIENT_BUFFER)
7412 {
7413 PyErr_SetFromWindowsErr(0);
7414 goto error;
7415 }
7416 insize++;
7417 }
7418 /* 4=maximum length of a UTF-8 sequence */
7419 while (insize <= 4 && (in + insize) <= endin);
7420
7421 if (outsize <= 0) {
7422 Py_ssize_t startinpos, endinpos, outpos;
7423
Victor Stinner7d00cc12014-03-17 23:08:06 +01007424 /* last character in partial decode? */
7425 if (in + insize >= endin && !final)
7426 break;
7427
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 startinpos = in - startin;
7429 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007430 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007431 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 errors, &errorHandler,
7433 encoding, reason,
7434 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007435 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 {
7437 goto error;
7438 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007439 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 }
7441 else {
7442 in += insize;
7443 memcpy(out, buffer, outsize * sizeof(wchar_t));
7444 out += outsize;
7445 }
7446 }
7447
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007448 /* Shrink the buffer */
7449 assert(out - *buf <= *bufsize);
7450 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007451 /* (in - startin) <= size and size is an int */
7452 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007453
7454error:
7455 Py_XDECREF(encoding_obj);
7456 Py_XDECREF(errorHandler);
7457 Py_XDECREF(exc);
7458 return ret;
7459}
7460
Victor Stinner3a50e702011-10-18 21:21:00 +02007461static PyObject *
7462decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007463 const char *s, Py_ssize_t size,
7464 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007465{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007466 wchar_t *buf = NULL;
7467 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007468 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007469
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 if (code_page < 0) {
7471 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7472 return NULL;
7473 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007474 if (size < 0) {
7475 PyErr_BadInternalCall();
7476 return NULL;
7477 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007478
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007481
Victor Stinner76a31a62011-11-04 00:05:13 +01007482 do
7483 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007484#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007485 if (size > DECODING_CHUNK_SIZE) {
7486 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007487 final = 0;
7488 done = 0;
7489 }
7490 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007491#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007492 {
7493 chunk_size = (int)size;
7494 final = (consumed == NULL);
7495 done = 1;
7496 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007497
Victor Stinner76a31a62011-11-04 00:05:13 +01007498 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007499 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007500 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007501 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007502 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007503
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007504 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007505 s, chunk_size);
7506 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007507 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007508 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007509 errors, final);
7510 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007511
7512 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007513 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007514 return NULL;
7515 }
7516
7517 if (consumed)
7518 *consumed += converted;
7519
7520 s += converted;
7521 size -= converted;
7522 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007523
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007524 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7525 PyMem_Free(buf);
7526 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007527}
7528
Alexander Belopolsky40018472011-02-26 01:02:56 +00007529PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007530PyUnicode_DecodeCodePageStateful(int code_page,
7531 const char *s,
7532 Py_ssize_t size,
7533 const char *errors,
7534 Py_ssize_t *consumed)
7535{
7536 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7537}
7538
7539PyObject *
7540PyUnicode_DecodeMBCSStateful(const char *s,
7541 Py_ssize_t size,
7542 const char *errors,
7543 Py_ssize_t *consumed)
7544{
7545 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7546}
7547
7548PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007549PyUnicode_DecodeMBCS(const char *s,
7550 Py_ssize_t size,
7551 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007552{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007553 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7554}
7555
Victor Stinner3a50e702011-10-18 21:21:00 +02007556static DWORD
7557encode_code_page_flags(UINT code_page, const char *errors)
7558{
7559 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007560 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007561 }
7562 else if (code_page == CP_UTF7) {
7563 /* CP_UTF7 only supports flags=0 */
7564 return 0;
7565 }
7566 else {
7567 if (errors != NULL && strcmp(errors, "replace") == 0)
7568 return 0;
7569 else
7570 return WC_NO_BEST_FIT_CHARS;
7571 }
7572}
7573
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007574/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007575 * Encode a Unicode string to a Windows code page into a byte string in strict
7576 * mode.
7577 *
7578 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007579 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007580 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007581static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007582encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007583 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007585{
Victor Stinner554f3f02010-06-16 23:33:54 +00007586 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 BOOL *pusedDefaultChar = &usedDefaultChar;
7588 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007589 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007590 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 const DWORD flags = encode_code_page_flags(code_page, NULL);
7592 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007593 /* Create a substring so that we can get the UTF-16 representation
7594 of just the slice under consideration. */
7595 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007596
Martin v. Löwis3d325192011-11-04 18:23:06 +01007597 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007598
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007600 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007602 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007603
Victor Stinner2fc507f2011-11-04 20:06:39 +01007604 substring = PyUnicode_Substring(unicode, offset, offset+len);
7605 if (substring == NULL)
7606 return -1;
7607 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7608 if (p == NULL) {
7609 Py_DECREF(substring);
7610 return -1;
7611 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007612 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007613
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007614 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007615 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007616 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 NULL, 0,
7618 NULL, pusedDefaultChar);
7619 if (outsize <= 0)
7620 goto error;
7621 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007622 if (pusedDefaultChar && *pusedDefaultChar) {
7623 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007625 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007626
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007629 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007630 if (*outbytes == NULL) {
7631 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007633 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007635 }
7636 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007638 const Py_ssize_t n = PyBytes_Size(*outbytes);
7639 if (outsize > PY_SSIZE_T_MAX - n) {
7640 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007641 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007644 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7645 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007647 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007648 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007649 }
7650
7651 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007653 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007654 out, outsize,
7655 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007656 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007657 if (outsize <= 0)
7658 goto error;
7659 if (pusedDefaultChar && *pusedDefaultChar)
7660 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007661 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007662
Victor Stinner3a50e702011-10-18 21:21:00 +02007663error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007664 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007665 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7666 return -2;
7667 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007668 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007669}
7670
Victor Stinner3a50e702011-10-18 21:21:00 +02007671/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007672 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 * error handler.
7674 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007675 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007676 * -1 on other error.
7677 */
7678static int
7679encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007680 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007681 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007682{
Victor Stinner3a50e702011-10-18 21:21:00 +02007683 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007684 Py_ssize_t pos = unicode_offset;
7685 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007686 /* Ideally, we should get reason from FormatMessage. This is the Windows
7687 2000 English version of the message. */
7688 const char *reason = "invalid character";
7689 /* 4=maximum length of a UTF-8 sequence */
7690 char buffer[4];
7691 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7692 Py_ssize_t outsize;
7693 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 PyObject *errorHandler = NULL;
7695 PyObject *exc = NULL;
7696 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007697 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007698 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007699 PyObject *rep;
7700 int ret = -1;
7701
7702 assert(insize > 0);
7703
7704 encoding = code_page_name(code_page, &encoding_obj);
7705 if (encoding == NULL)
7706 return -1;
7707
7708 if (errors == NULL || strcmp(errors, "strict") == 0) {
7709 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7710 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007711 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007712 if (exc != NULL) {
7713 PyCodec_StrictErrors(exc);
7714 Py_DECREF(exc);
7715 }
7716 Py_XDECREF(encoding_obj);
7717 return -1;
7718 }
7719
7720 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7721 pusedDefaultChar = &usedDefaultChar;
7722 else
7723 pusedDefaultChar = NULL;
7724
7725 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7726 PyErr_NoMemory();
7727 goto error;
7728 }
7729 outsize = insize * Py_ARRAY_LENGTH(buffer);
7730
7731 if (*outbytes == NULL) {
7732 /* Create string object */
7733 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7734 if (*outbytes == NULL)
7735 goto error;
7736 out = PyBytes_AS_STRING(*outbytes);
7737 }
7738 else {
7739 /* Extend string object */
7740 Py_ssize_t n = PyBytes_Size(*outbytes);
7741 if (n > PY_SSIZE_T_MAX - outsize) {
7742 PyErr_NoMemory();
7743 goto error;
7744 }
7745 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7746 goto error;
7747 out = PyBytes_AS_STRING(*outbytes) + n;
7748 }
7749
7750 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007752 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007753 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7754 wchar_t chars[2];
7755 int charsize;
7756 if (ch < 0x10000) {
7757 chars[0] = (wchar_t)ch;
7758 charsize = 1;
7759 }
7760 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007761 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7762 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007763 charsize = 2;
7764 }
7765
Victor Stinner3a50e702011-10-18 21:21:00 +02007766 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007767 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007768 buffer, Py_ARRAY_LENGTH(buffer),
7769 NULL, pusedDefaultChar);
7770 if (outsize > 0) {
7771 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7772 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007773 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007774 memcpy(out, buffer, outsize);
7775 out += outsize;
7776 continue;
7777 }
7778 }
7779 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7780 PyErr_SetFromWindowsErr(0);
7781 goto error;
7782 }
7783
Victor Stinner3a50e702011-10-18 21:21:00 +02007784 rep = unicode_encode_call_errorhandler(
7785 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007786 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007787 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007788 if (rep == NULL)
7789 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007790 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007791
7792 if (PyBytes_Check(rep)) {
7793 outsize = PyBytes_GET_SIZE(rep);
7794 if (outsize != 1) {
7795 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7796 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7797 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7798 Py_DECREF(rep);
7799 goto error;
7800 }
7801 out = PyBytes_AS_STRING(*outbytes) + offset;
7802 }
7803 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7804 out += outsize;
7805 }
7806 else {
7807 Py_ssize_t i;
7808 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007809 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007810
Benjamin Petersonbac79492012-01-14 13:34:47 -05007811 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007812 Py_DECREF(rep);
7813 goto error;
7814 }
7815
7816 outsize = PyUnicode_GET_LENGTH(rep);
7817 if (outsize != 1) {
7818 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7819 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7820 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7821 Py_DECREF(rep);
7822 goto error;
7823 }
7824 out = PyBytes_AS_STRING(*outbytes) + offset;
7825 }
7826 kind = PyUnicode_KIND(rep);
7827 data = PyUnicode_DATA(rep);
7828 for (i=0; i < outsize; i++) {
7829 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7830 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007831 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007832 encoding, unicode,
7833 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007834 "unable to encode error handler result to ASCII");
7835 Py_DECREF(rep);
7836 goto error;
7837 }
7838 *out = (unsigned char)ch;
7839 out++;
7840 }
7841 }
7842 Py_DECREF(rep);
7843 }
7844 /* write a NUL byte */
7845 *out = 0;
7846 outsize = out - PyBytes_AS_STRING(*outbytes);
7847 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7848 if (_PyBytes_Resize(outbytes, outsize) < 0)
7849 goto error;
7850 ret = 0;
7851
7852error:
7853 Py_XDECREF(encoding_obj);
7854 Py_XDECREF(errorHandler);
7855 Py_XDECREF(exc);
7856 return ret;
7857}
7858
Victor Stinner3a50e702011-10-18 21:21:00 +02007859static PyObject *
7860encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007861 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007862 const char *errors)
7863{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007864 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007865 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007866 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007867 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007868
Victor Stinner29dacf22015-01-26 16:41:32 +01007869 if (!PyUnicode_Check(unicode)) {
7870 PyErr_BadArgument();
7871 return NULL;
7872 }
7873
Benjamin Petersonbac79492012-01-14 13:34:47 -05007874 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007875 return NULL;
7876 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007877
Victor Stinner3a50e702011-10-18 21:21:00 +02007878 if (code_page < 0) {
7879 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7880 return NULL;
7881 }
7882
Martin v. Löwis3d325192011-11-04 18:23:06 +01007883 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007884 return PyBytes_FromStringAndSize(NULL, 0);
7885
Victor Stinner7581cef2011-11-03 22:32:33 +01007886 offset = 0;
7887 do
7888 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007889#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007890 if (len > DECODING_CHUNK_SIZE) {
7891 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007892 done = 0;
7893 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007894 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007895#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007896 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007897 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007898 done = 1;
7899 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007900
Victor Stinner76a31a62011-11-04 00:05:13 +01007901 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007902 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007903 errors);
7904 if (ret == -2)
7905 ret = encode_code_page_errors(code_page, &outbytes,
7906 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007907 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007908 if (ret < 0) {
7909 Py_XDECREF(outbytes);
7910 return NULL;
7911 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007912
Victor Stinner7581cef2011-11-03 22:32:33 +01007913 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007914 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007915 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007916
Victor Stinner3a50e702011-10-18 21:21:00 +02007917 return outbytes;
7918}
7919
7920PyObject *
7921PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7922 Py_ssize_t size,
7923 const char *errors)
7924{
Victor Stinner7581cef2011-11-03 22:32:33 +01007925 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007926 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007927 if (unicode == NULL)
7928 return NULL;
7929 res = encode_code_page(CP_ACP, unicode, errors);
7930 Py_DECREF(unicode);
7931 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007932}
7933
7934PyObject *
7935PyUnicode_EncodeCodePage(int code_page,
7936 PyObject *unicode,
7937 const char *errors)
7938{
Victor Stinner7581cef2011-11-03 22:32:33 +01007939 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007940}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007941
Alexander Belopolsky40018472011-02-26 01:02:56 +00007942PyObject *
7943PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007944{
Victor Stinner7581cef2011-11-03 22:32:33 +01007945 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007946}
7947
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007948#undef NEED_RETRY
7949
Steve Dowercc16be82016-09-08 10:35:16 -07007950#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007951
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952/* --- Character Mapping Codec -------------------------------------------- */
7953
Victor Stinnerfb161b12013-04-18 01:44:27 +02007954static int
7955charmap_decode_string(const char *s,
7956 Py_ssize_t size,
7957 PyObject *mapping,
7958 const char *errors,
7959 _PyUnicodeWriter *writer)
7960{
7961 const char *starts = s;
7962 const char *e;
7963 Py_ssize_t startinpos, endinpos;
7964 PyObject *errorHandler = NULL, *exc = NULL;
7965 Py_ssize_t maplen;
7966 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007967 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007968 Py_UCS4 x;
7969 unsigned char ch;
7970
7971 if (PyUnicode_READY(mapping) == -1)
7972 return -1;
7973
7974 maplen = PyUnicode_GET_LENGTH(mapping);
7975 mapdata = PyUnicode_DATA(mapping);
7976 mapkind = PyUnicode_KIND(mapping);
7977
7978 e = s + size;
7979
7980 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7981 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7982 * is disabled in encoding aliases, latin1 is preferred because
7983 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007984 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007985 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7986 Py_UCS4 maxchar = writer->maxchar;
7987
7988 assert (writer->kind == PyUnicode_1BYTE_KIND);
7989 while (s < e) {
7990 ch = *s;
7991 x = mapdata_ucs1[ch];
7992 if (x > maxchar) {
7993 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7994 goto onError;
7995 maxchar = writer->maxchar;
7996 outdata = (Py_UCS1 *)writer->data;
7997 }
7998 outdata[writer->pos] = x;
7999 writer->pos++;
8000 ++s;
8001 }
8002 return 0;
8003 }
8004
8005 while (s < e) {
8006 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8007 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008008 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008009 if (outkind == PyUnicode_1BYTE_KIND) {
8010 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8011 Py_UCS4 maxchar = writer->maxchar;
8012 while (s < e) {
8013 ch = *s;
8014 x = mapdata_ucs2[ch];
8015 if (x > maxchar)
8016 goto Error;
8017 outdata[writer->pos] = x;
8018 writer->pos++;
8019 ++s;
8020 }
8021 break;
8022 }
8023 else if (outkind == PyUnicode_2BYTE_KIND) {
8024 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8025 while (s < e) {
8026 ch = *s;
8027 x = mapdata_ucs2[ch];
8028 if (x == 0xFFFE)
8029 goto Error;
8030 outdata[writer->pos] = x;
8031 writer->pos++;
8032 ++s;
8033 }
8034 break;
8035 }
8036 }
8037 ch = *s;
8038
8039 if (ch < maplen)
8040 x = PyUnicode_READ(mapkind, mapdata, ch);
8041 else
8042 x = 0xfffe; /* invalid value */
8043Error:
8044 if (x == 0xfffe)
8045 {
8046 /* undefined mapping */
8047 startinpos = s-starts;
8048 endinpos = startinpos+1;
8049 if (unicode_decode_call_errorhandler_writer(
8050 errors, &errorHandler,
8051 "charmap", "character maps to <undefined>",
8052 &starts, &e, &startinpos, &endinpos, &exc, &s,
8053 writer)) {
8054 goto onError;
8055 }
8056 continue;
8057 }
8058
8059 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8060 goto onError;
8061 ++s;
8062 }
8063 Py_XDECREF(errorHandler);
8064 Py_XDECREF(exc);
8065 return 0;
8066
8067onError:
8068 Py_XDECREF(errorHandler);
8069 Py_XDECREF(exc);
8070 return -1;
8071}
8072
8073static int
8074charmap_decode_mapping(const char *s,
8075 Py_ssize_t size,
8076 PyObject *mapping,
8077 const char *errors,
8078 _PyUnicodeWriter *writer)
8079{
8080 const char *starts = s;
8081 const char *e;
8082 Py_ssize_t startinpos, endinpos;
8083 PyObject *errorHandler = NULL, *exc = NULL;
8084 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008085 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008086
8087 e = s + size;
8088
8089 while (s < e) {
8090 ch = *s;
8091
8092 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8093 key = PyLong_FromLong((long)ch);
8094 if (key == NULL)
8095 goto onError;
8096
8097 item = PyObject_GetItem(mapping, key);
8098 Py_DECREF(key);
8099 if (item == NULL) {
8100 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8101 /* No mapping found means: mapping is undefined. */
8102 PyErr_Clear();
8103 goto Undefined;
8104 } else
8105 goto onError;
8106 }
8107
8108 /* Apply mapping */
8109 if (item == Py_None)
8110 goto Undefined;
8111 if (PyLong_Check(item)) {
8112 long value = PyLong_AS_LONG(item);
8113 if (value == 0xFFFE)
8114 goto Undefined;
8115 if (value < 0 || value > MAX_UNICODE) {
8116 PyErr_Format(PyExc_TypeError,
8117 "character mapping must be in range(0x%lx)",
8118 (unsigned long)MAX_UNICODE + 1);
8119 goto onError;
8120 }
8121
8122 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8123 goto onError;
8124 }
8125 else if (PyUnicode_Check(item)) {
8126 if (PyUnicode_READY(item) == -1)
8127 goto onError;
8128 if (PyUnicode_GET_LENGTH(item) == 1) {
8129 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8130 if (value == 0xFFFE)
8131 goto Undefined;
8132 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8133 goto onError;
8134 }
8135 else {
8136 writer->overallocate = 1;
8137 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8138 goto onError;
8139 }
8140 }
8141 else {
8142 /* wrong return value */
8143 PyErr_SetString(PyExc_TypeError,
8144 "character mapping must return integer, None or str");
8145 goto onError;
8146 }
8147 Py_CLEAR(item);
8148 ++s;
8149 continue;
8150
8151Undefined:
8152 /* undefined mapping */
8153 Py_CLEAR(item);
8154 startinpos = s-starts;
8155 endinpos = startinpos+1;
8156 if (unicode_decode_call_errorhandler_writer(
8157 errors, &errorHandler,
8158 "charmap", "character maps to <undefined>",
8159 &starts, &e, &startinpos, &endinpos, &exc, &s,
8160 writer)) {
8161 goto onError;
8162 }
8163 }
8164 Py_XDECREF(errorHandler);
8165 Py_XDECREF(exc);
8166 return 0;
8167
8168onError:
8169 Py_XDECREF(item);
8170 Py_XDECREF(errorHandler);
8171 Py_XDECREF(exc);
8172 return -1;
8173}
8174
Alexander Belopolsky40018472011-02-26 01:02:56 +00008175PyObject *
8176PyUnicode_DecodeCharmap(const char *s,
8177 Py_ssize_t size,
8178 PyObject *mapping,
8179 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008181 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008182
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183 /* Default to Latin-1 */
8184 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008188 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008189 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008190 writer.min_length = size;
8191 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008193
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008194 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008195 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8196 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008197 }
8198 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008199 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008202 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008203
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008205 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206 return NULL;
8207}
8208
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008209/* Charmap encoding: the lookup table */
8210
Alexander Belopolsky40018472011-02-26 01:02:56 +00008211struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 PyObject_HEAD
8213 unsigned char level1[32];
8214 int count2, count3;
8215 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008216};
8217
8218static PyObject*
8219encoding_map_size(PyObject *obj, PyObject* args)
8220{
8221 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008224}
8225
8226static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008227 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 PyDoc_STR("Return the size (in bytes) of this object") },
8229 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008230};
8231
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008232static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008233 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 "EncodingMap", /*tp_name*/
8235 sizeof(struct encoding_map), /*tp_basicsize*/
8236 0, /*tp_itemsize*/
8237 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008238 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008239 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 0, /*tp_getattr*/
8241 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008242 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 0, /*tp_repr*/
8244 0, /*tp_as_number*/
8245 0, /*tp_as_sequence*/
8246 0, /*tp_as_mapping*/
8247 0, /*tp_hash*/
8248 0, /*tp_call*/
8249 0, /*tp_str*/
8250 0, /*tp_getattro*/
8251 0, /*tp_setattro*/
8252 0, /*tp_as_buffer*/
8253 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8254 0, /*tp_doc*/
8255 0, /*tp_traverse*/
8256 0, /*tp_clear*/
8257 0, /*tp_richcompare*/
8258 0, /*tp_weaklistoffset*/
8259 0, /*tp_iter*/
8260 0, /*tp_iternext*/
8261 encoding_map_methods, /*tp_methods*/
8262 0, /*tp_members*/
8263 0, /*tp_getset*/
8264 0, /*tp_base*/
8265 0, /*tp_dict*/
8266 0, /*tp_descr_get*/
8267 0, /*tp_descr_set*/
8268 0, /*tp_dictoffset*/
8269 0, /*tp_init*/
8270 0, /*tp_alloc*/
8271 0, /*tp_new*/
8272 0, /*tp_free*/
8273 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008274};
8275
8276PyObject*
8277PyUnicode_BuildEncodingMap(PyObject* string)
8278{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279 PyObject *result;
8280 struct encoding_map *mresult;
8281 int i;
8282 int need_dict = 0;
8283 unsigned char level1[32];
8284 unsigned char level2[512];
8285 unsigned char *mlevel1, *mlevel2, *mlevel3;
8286 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008288 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008289 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008291
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008292 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008293 PyErr_BadArgument();
8294 return NULL;
8295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296 kind = PyUnicode_KIND(string);
8297 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008298 length = PyUnicode_GET_LENGTH(string);
8299 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008300 memset(level1, 0xFF, sizeof level1);
8301 memset(level2, 0xFF, sizeof level2);
8302
8303 /* If there isn't a one-to-one mapping of NULL to \0,
8304 or if there are non-BMP characters, we need to use
8305 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008306 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008308 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008310 ch = PyUnicode_READ(kind, data, i);
8311 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 need_dict = 1;
8313 break;
8314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008316 /* unmapped character */
8317 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008318 l1 = ch >> 11;
8319 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320 if (level1[l1] == 0xFF)
8321 level1[l1] = count2++;
8322 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008323 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008324 }
8325
8326 if (count2 >= 0xFF || count3 >= 0xFF)
8327 need_dict = 1;
8328
8329 if (need_dict) {
8330 PyObject *result = PyDict_New();
8331 PyObject *key, *value;
8332 if (!result)
8333 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008334 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008336 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 if (!key || !value)
8338 goto failed1;
8339 if (PyDict_SetItem(result, key, value) == -1)
8340 goto failed1;
8341 Py_DECREF(key);
8342 Py_DECREF(value);
8343 }
8344 return result;
8345 failed1:
8346 Py_XDECREF(key);
8347 Py_XDECREF(value);
8348 Py_DECREF(result);
8349 return NULL;
8350 }
8351
8352 /* Create a three-level trie */
8353 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8354 16*count2 + 128*count3 - 1);
8355 if (!result)
8356 return PyErr_NoMemory();
8357 PyObject_Init(result, &EncodingMapType);
8358 mresult = (struct encoding_map*)result;
8359 mresult->count2 = count2;
8360 mresult->count3 = count3;
8361 mlevel1 = mresult->level1;
8362 mlevel2 = mresult->level23;
8363 mlevel3 = mresult->level23 + 16*count2;
8364 memcpy(mlevel1, level1, 32);
8365 memset(mlevel2, 0xFF, 16*count2);
8366 memset(mlevel3, 0, 128*count3);
8367 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008368 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008369 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008370 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8371 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 /* unmapped character */
8373 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008374 o1 = ch>>11;
8375 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 i2 = 16*mlevel1[o1] + o2;
8377 if (mlevel2[i2] == 0xFF)
8378 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008379 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008380 i3 = 128*mlevel2[i2] + o3;
8381 mlevel3[i3] = i;
8382 }
8383 return result;
8384}
8385
8386static int
Victor Stinner22168992011-11-20 17:09:18 +01008387encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008388{
8389 struct encoding_map *map = (struct encoding_map*)mapping;
8390 int l1 = c>>11;
8391 int l2 = (c>>7) & 0xF;
8392 int l3 = c & 0x7F;
8393 int i;
8394
Victor Stinner22168992011-11-20 17:09:18 +01008395 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008397 if (c == 0)
8398 return 0;
8399 /* level 1*/
8400 i = map->level1[l1];
8401 if (i == 0xFF) {
8402 return -1;
8403 }
8404 /* level 2*/
8405 i = map->level23[16*i+l2];
8406 if (i == 0xFF) {
8407 return -1;
8408 }
8409 /* level 3 */
8410 i = map->level23[16*map->count2 + 128*i + l3];
8411 if (i == 0) {
8412 return -1;
8413 }
8414 return i;
8415}
8416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417/* Lookup the character ch in the mapping. If the character
8418 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008419 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008420static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008421charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422{
Christian Heimes217cfd12007-12-02 14:31:20 +00008423 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 PyObject *x;
8425
8426 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 x = PyObject_GetItem(mapping, w);
8429 Py_DECREF(w);
8430 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8432 /* No mapping found means: mapping is undefined. */
8433 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008434 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 } else
8436 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008438 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008440 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 long value = PyLong_AS_LONG(x);
8442 if (value < 0 || value > 255) {
8443 PyErr_SetString(PyExc_TypeError,
8444 "character mapping must be in range(256)");
8445 Py_DECREF(x);
8446 return NULL;
8447 }
8448 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008450 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 /* wrong return value */
8454 PyErr_Format(PyExc_TypeError,
8455 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008456 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 Py_DECREF(x);
8458 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459 }
8460}
8461
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008462static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008463charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008464{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008465 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8466 /* exponentially overallocate to minimize reallocations */
8467 if (requiredsize < 2*outsize)
8468 requiredsize = 2*outsize;
8469 if (_PyBytes_Resize(outobj, requiredsize))
8470 return -1;
8471 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008472}
8473
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the encoded byte(s) were written */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008477/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008478 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008479 space is available. Return a new reference to the object that
8480 was put in the output buffer, or Py_None, if the mapping was undefined
8481 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008482 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008483static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008484charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008485 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008487 PyObject *rep;
8488 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008489 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490
Andy Lesterdffe4c02020-03-04 07:15:20 -06008491 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008492 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008494 if (res == -1)
8495 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 if (outsize<requiredsize)
8497 if (charmapencode_resize(outobj, outpos, requiredsize))
8498 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008499 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 outstart[(*outpos)++] = (char)res;
8501 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008502 }
8503
8504 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008507 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 Py_DECREF(rep);
8509 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008510 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 if (PyLong_Check(rep)) {
8512 Py_ssize_t requiredsize = *outpos+1;
8513 if (outsize<requiredsize)
8514 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8515 Py_DECREF(rep);
8516 return enc_EXCEPTION;
8517 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008518 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008520 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 else {
8522 const char *repchars = PyBytes_AS_STRING(rep);
8523 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8524 Py_ssize_t requiredsize = *outpos+repsize;
8525 if (outsize<requiredsize)
8526 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8527 Py_DECREF(rep);
8528 return enc_EXCEPTION;
8529 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008530 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 memcpy(outstart + *outpos, repchars, repsize);
8532 *outpos += repsize;
8533 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008535 Py_DECREF(rep);
8536 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537}
8538
8539/* handle an error in PyUnicode_EncodeCharmap
8540 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008541static int
8542charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008543 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008545 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008546 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547{
8548 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008549 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008550 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008551 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008552 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008553 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008555 Py_ssize_t collstartpos = *inpos;
8556 Py_ssize_t collendpos = *inpos+1;
8557 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008558 const char *encoding = "charmap";
8559 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008560 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008561 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008562 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563
Benjamin Petersonbac79492012-01-14 13:34:47 -05008564 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008565 return -1;
8566 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 /* find all unencodable characters */
8568 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008569 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008570 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008571 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008572 val = encoding_map_lookup(ch, mapping);
8573 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 break;
8575 ++collendpos;
8576 continue;
8577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008579 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8580 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 if (rep==NULL)
8582 return -1;
8583 else if (rep!=Py_None) {
8584 Py_DECREF(rep);
8585 break;
8586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008587 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 }
8590 /* cache callback name lookup
8591 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008592 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008593 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008594
8595 switch (*error_handler) {
8596 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008597 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008598 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008599
8600 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008601 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 x = charmapencode_output('?', mapping, res, respos);
8603 if (x==enc_EXCEPTION) {
8604 return -1;
8605 }
8606 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008607 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 return -1;
8609 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008610 }
8611 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008612 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008613 *inpos = collendpos;
8614 break;
Victor Stinner50149202015-09-22 00:26:54 +02008615
8616 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008617 /* generate replacement (temporarily (mis)uses p) */
8618 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 char buffer[2+29+1+1];
8620 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008621 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 for (cp = buffer; *cp; ++cp) {
8623 x = charmapencode_output(*cp, mapping, res, respos);
8624 if (x==enc_EXCEPTION)
8625 return -1;
8626 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008627 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 return -1;
8629 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008630 }
8631 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008632 *inpos = collendpos;
8633 break;
Victor Stinner50149202015-09-22 00:26:54 +02008634
Benjamin Peterson14339b62009-01-31 16:36:08 +00008635 default:
Victor Stinner50149202015-09-22 00:26:54 +02008636 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008637 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008639 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008641 if (PyBytes_Check(repunicode)) {
8642 /* Directly copy bytes result to output. */
8643 Py_ssize_t outsize = PyBytes_Size(*res);
8644 Py_ssize_t requiredsize;
8645 repsize = PyBytes_Size(repunicode);
8646 requiredsize = *respos + repsize;
8647 if (requiredsize > outsize)
8648 /* Make room for all additional bytes. */
8649 if (charmapencode_resize(res, respos, requiredsize)) {
8650 Py_DECREF(repunicode);
8651 return -1;
8652 }
8653 memcpy(PyBytes_AsString(*res) + *respos,
8654 PyBytes_AsString(repunicode), repsize);
8655 *respos += repsize;
8656 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008657 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008658 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008659 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008660 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008661 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008662 Py_DECREF(repunicode);
8663 return -1;
8664 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008665 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008666 data = PyUnicode_DATA(repunicode);
8667 kind = PyUnicode_KIND(repunicode);
8668 for (index = 0; index < repsize; index++) {
8669 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8670 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008672 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return -1;
8674 }
8675 else if (x==enc_FAILED) {
8676 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008677 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 return -1;
8679 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008680 }
8681 *inpos = newpos;
8682 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 }
8684 return 0;
8685}
8686
Alexander Belopolsky40018472011-02-26 01:02:56 +00008687PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008688_PyUnicode_EncodeCharmap(PyObject *unicode,
8689 PyObject *mapping,
8690 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 /* output object */
8693 PyObject *res = NULL;
8694 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008695 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008696 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008698 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008699 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008701 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008702 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008703 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704
Benjamin Petersonbac79492012-01-14 13:34:47 -05008705 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008706 return NULL;
8707 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008708 data = PyUnicode_DATA(unicode);
8709 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008710
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 /* Default to Latin-1 */
8712 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008713 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715 /* allocate enough for a simple encoding without
8716 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008717 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 if (res == NULL)
8719 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008720 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008724 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008726 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 if (x==enc_EXCEPTION) /* error */
8728 goto onError;
8729 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008730 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008732 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 &res, &respos)) {
8734 goto onError;
8735 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008736 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 else
8738 /* done with this character => adjust input position */
8739 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008743 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008744 if (_PyBytes_Resize(&res, respos) < 0)
8745 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008746
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008747 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008748 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749 return res;
8750
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008752 Py_XDECREF(res);
8753 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008754 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755 return NULL;
8756}
8757
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008758/* Deprecated */
8759PyObject *
8760PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8761 Py_ssize_t size,
8762 PyObject *mapping,
8763 const char *errors)
8764{
8765 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008766 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008767 if (unicode == NULL)
8768 return NULL;
8769 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8770 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008771 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008772}
8773
Alexander Belopolsky40018472011-02-26 01:02:56 +00008774PyObject *
8775PyUnicode_AsCharmapString(PyObject *unicode,
8776 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777{
8778 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 PyErr_BadArgument();
8780 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008782 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783}
8784
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008786static void
8787make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008789 Py_ssize_t startpos, Py_ssize_t endpos,
8790 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 *exceptionObject = _PyUnicodeTranslateError_Create(
8794 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795 }
8796 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8798 goto onError;
8799 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8800 goto onError;
8801 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8802 goto onError;
8803 return;
8804 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008805 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806 }
8807}
8808
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008809/* error handling callback helper:
8810 build arguments, call the callback and check the arguments,
8811 put the result into newpos and return the replacement string, which
8812 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008813static PyObject *
8814unicode_translate_call_errorhandler(const char *errors,
8815 PyObject **errorHandler,
8816 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008818 Py_ssize_t startpos, Py_ssize_t endpos,
8819 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008821 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008822
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008823 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008824 PyObject *restuple;
8825 PyObject *resunicode;
8826
8827 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008829 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008831 }
8832
8833 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008835 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008837
Petr Viktorinffd97532020-02-11 17:46:57 +01008838 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008839 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008841 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008842 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 Py_DECREF(restuple);
8844 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008845 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008846 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008847 &resunicode, &i_newpos)) {
8848 Py_DECREF(restuple);
8849 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008850 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008851 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008853 else
8854 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008856 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008857 Py_DECREF(restuple);
8858 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008859 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008860 Py_INCREF(resunicode);
8861 Py_DECREF(restuple);
8862 return resunicode;
8863}
8864
8865/* Lookup the character ch in the mapping and put the result in result,
8866 which must be decrefed by the caller.
8867 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008868static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870{
Christian Heimes217cfd12007-12-02 14:31:20 +00008871 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008872 PyObject *x;
8873
8874 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008876 x = PyObject_GetItem(mapping, w);
8877 Py_DECREF(w);
8878 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8880 /* No mapping found means: use 1:1 mapping. */
8881 PyErr_Clear();
8882 *result = NULL;
8883 return 0;
8884 } else
8885 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008886 }
8887 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 *result = x;
8889 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008890 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008891 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008893 if (value < 0 || value > MAX_UNICODE) {
8894 PyErr_Format(PyExc_ValueError,
8895 "character mapping must be in range(0x%x)",
8896 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 Py_DECREF(x);
8898 return -1;
8899 }
8900 *result = x;
8901 return 0;
8902 }
8903 else if (PyUnicode_Check(x)) {
8904 *result = x;
8905 return 0;
8906 }
8907 else {
8908 /* wrong return value */
8909 PyErr_SetString(PyExc_TypeError,
8910 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008911 Py_DECREF(x);
8912 return -1;
8913 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008914}
Victor Stinner1194ea02014-04-04 19:37:40 +02008915
8916/* lookup the character, write the result into the writer.
8917 Return 1 if the result was written into the writer, return 0 if the mapping
8918 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008919static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008920charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8921 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008922{
Victor Stinner1194ea02014-04-04 19:37:40 +02008923 PyObject *item;
8924
8925 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008927
8928 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008930 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008933 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008934 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008935
8936 if (item == Py_None) {
8937 Py_DECREF(item);
8938 return 0;
8939 }
8940
8941 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008942 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8943 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8944 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008945 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8946 Py_DECREF(item);
8947 return -1;
8948 }
8949 Py_DECREF(item);
8950 return 1;
8951 }
8952
8953 if (!PyUnicode_Check(item)) {
8954 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008956 }
8957
8958 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8959 Py_DECREF(item);
8960 return -1;
8961 }
8962
8963 Py_DECREF(item);
8964 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008965}
8966
Victor Stinner89a76ab2014-04-05 11:44:04 +02008967static int
8968unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8969 Py_UCS1 *translate)
8970{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008971 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008972 int ret = 0;
8973
Victor Stinner89a76ab2014-04-05 11:44:04 +02008974 if (charmaptranslate_lookup(ch, mapping, &item)) {
8975 return -1;
8976 }
8977
8978 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008979 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008980 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008981 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008982 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008983 /* not found => default to 1:1 mapping */
8984 translate[ch] = ch;
8985 return 1;
8986 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008987 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008988 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008989 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8990 used it */
8991 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008992 /* invalid character or character outside ASCII:
8993 skip the fast translate */
8994 goto exit;
8995 }
8996 translate[ch] = (Py_UCS1)replace;
8997 }
8998 else if (PyUnicode_Check(item)) {
8999 Py_UCS4 replace;
9000
9001 if (PyUnicode_READY(item) == -1) {
9002 Py_DECREF(item);
9003 return -1;
9004 }
9005 if (PyUnicode_GET_LENGTH(item) != 1)
9006 goto exit;
9007
9008 replace = PyUnicode_READ_CHAR(item, 0);
9009 if (replace > 127)
9010 goto exit;
9011 translate[ch] = (Py_UCS1)replace;
9012 }
9013 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009014 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009015 goto exit;
9016 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009017 ret = 1;
9018
Benjamin Peterson1365de72014-04-07 20:15:41 -04009019 exit:
9020 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009021 return ret;
9022}
9023
9024/* Fast path for ascii => ascii translation. Return 1 if the whole string
9025 was translated into writer, return 0 if the input string was partially
9026 translated into writer, raise an exception and return -1 on error. */
9027static int
9028unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009029 _PyUnicodeWriter *writer, int ignore,
9030 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009031{
Victor Stinner872b2912014-04-05 14:27:07 +02009032 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009033 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009034 const Py_UCS1 *in, *end;
9035 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009036 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009037
Victor Stinner89a76ab2014-04-05 11:44:04 +02009038 len = PyUnicode_GET_LENGTH(input);
9039
Victor Stinner872b2912014-04-05 14:27:07 +02009040 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009041
9042 in = PyUnicode_1BYTE_DATA(input);
9043 end = in + len;
9044
9045 assert(PyUnicode_IS_ASCII(writer->buffer));
9046 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9047 out = PyUnicode_1BYTE_DATA(writer->buffer);
9048
Victor Stinner872b2912014-04-05 14:27:07 +02009049 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009050 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009051 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009052 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009053 int translate = unicode_fast_translate_lookup(mapping, ch,
9054 ascii_table);
9055 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009056 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009057 if (translate == 0)
9058 goto exit;
9059 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009060 }
Victor Stinner872b2912014-04-05 14:27:07 +02009061 if (ch2 == 0xfe) {
9062 if (ignore)
9063 continue;
9064 goto exit;
9065 }
9066 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009067 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009068 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009069 }
Victor Stinner872b2912014-04-05 14:27:07 +02009070 res = 1;
9071
9072exit:
9073 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009074 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009075 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009076}
9077
Victor Stinner3222da22015-10-01 22:07:32 +02009078static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079_PyUnicode_TranslateCharmap(PyObject *input,
9080 PyObject *mapping,
9081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009084 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 Py_ssize_t size, i;
9086 int kind;
9087 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009088 _PyUnicodeWriter writer;
9089 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009090 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009091 PyObject *errorHandler = NULL;
9092 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009093 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009094 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009095
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 PyErr_BadArgument();
9098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 if (PyUnicode_READY(input) == -1)
9102 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009103 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 kind = PyUnicode_KIND(input);
9105 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009107 if (size == 0)
9108 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009110 /* allocate enough for a simple 1:1 translation without
9111 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009112 _PyUnicodeWriter_Init(&writer);
9113 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115
Victor Stinner872b2912014-04-05 14:27:07 +02009116 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9117
Victor Stinner33798672016-03-01 21:59:58 +01009118 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009119 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009120 if (PyUnicode_IS_ASCII(input)) {
9121 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9122 if (res < 0) {
9123 _PyUnicodeWriter_Dealloc(&writer);
9124 return NULL;
9125 }
9126 if (res == 1)
9127 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009128 }
Victor Stinner33798672016-03-01 21:59:58 +01009129 else {
9130 i = 0;
9131 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009135 int translate;
9136 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9137 Py_ssize_t newpos;
9138 /* startpos for collecting untranslatable chars */
9139 Py_ssize_t collstart;
9140 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009141 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142
Victor Stinner1194ea02014-04-04 19:37:40 +02009143 ch = PyUnicode_READ(kind, data, i);
9144 translate = charmaptranslate_output(ch, mapping, &writer);
9145 if (translate < 0)
9146 goto onError;
9147
9148 if (translate != 0) {
9149 /* it worked => adjust input pointer */
9150 ++i;
9151 continue;
9152 }
9153
9154 /* untranslatable character */
9155 collstart = i;
9156 collend = i+1;
9157
9158 /* find all untranslatable characters */
9159 while (collend < size) {
9160 PyObject *x;
9161 ch = PyUnicode_READ(kind, data, collend);
9162 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009163 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009164 Py_XDECREF(x);
9165 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009167 ++collend;
9168 }
9169
9170 if (ignore) {
9171 i = collend;
9172 }
9173 else {
9174 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9175 reason, input, &exc,
9176 collstart, collend, &newpos);
9177 if (repunicode == NULL)
9178 goto onError;
9179 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009181 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009182 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009183 Py_DECREF(repunicode);
9184 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009185 }
9186 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009187 Py_XDECREF(exc);
9188 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009189 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009190
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009192 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009193 Py_XDECREF(exc);
9194 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195 return NULL;
9196}
9197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198/* Deprecated. Use PyUnicode_Translate instead. */
9199PyObject *
9200PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9201 Py_ssize_t size,
9202 PyObject *mapping,
9203 const char *errors)
9204{
Christian Heimes5f520f42012-09-11 14:03:25 +02009205 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009206 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 if (!unicode)
9208 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009209 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9210 Py_DECREF(unicode);
9211 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212}
9213
Alexander Belopolsky40018472011-02-26 01:02:56 +00009214PyObject *
9215PyUnicode_Translate(PyObject *str,
9216 PyObject *mapping,
9217 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009219 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009220 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009221 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222}
Tim Petersced69f82003-09-16 20:30:58 +00009223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224PyObject *
9225_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9226{
9227 if (!PyUnicode_Check(unicode)) {
9228 PyErr_BadInternalCall();
9229 return NULL;
9230 }
9231 if (PyUnicode_READY(unicode) == -1)
9232 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009233 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234 /* If the string is already ASCII, just return the same string */
9235 Py_INCREF(unicode);
9236 return unicode;
9237 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009238
9239 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9240 PyObject *result = PyUnicode_New(len, 127);
9241 if (result == NULL) {
9242 return NULL;
9243 }
9244
9245 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9246 int kind = PyUnicode_KIND(unicode);
9247 const void *data = PyUnicode_DATA(unicode);
9248 Py_ssize_t i;
9249 for (i = 0; i < len; ++i) {
9250 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9251 if (ch < 127) {
9252 out[i] = ch;
9253 }
9254 else if (Py_UNICODE_ISSPACE(ch)) {
9255 out[i] = ' ';
9256 }
9257 else {
9258 int decimal = Py_UNICODE_TODECIMAL(ch);
9259 if (decimal < 0) {
9260 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009261 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009262 _PyUnicode_LENGTH(result) = i + 1;
9263 break;
9264 }
9265 out[i] = '0' + decimal;
9266 }
9267 }
9268
INADA Naoki16dfca42018-07-14 12:06:43 +09009269 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009270 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271}
9272
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009273PyObject *
9274PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9275 Py_ssize_t length)
9276{
Victor Stinnerf0124502011-11-21 23:12:56 +01009277 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009278 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009279 Py_UCS4 maxchar;
9280 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009281 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009282
Victor Stinner99d7ad02012-02-22 13:37:39 +01009283 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009284 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009285 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009286 if (ch > 127) {
9287 int decimal = Py_UNICODE_TODECIMAL(ch);
9288 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009289 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009290 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009291 }
9292 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009293
9294 /* Copy to a new string */
9295 decimal = PyUnicode_New(length, maxchar);
9296 if (decimal == NULL)
9297 return decimal;
9298 kind = PyUnicode_KIND(decimal);
9299 data = PyUnicode_DATA(decimal);
9300 /* Iterate over code points */
9301 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009302 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009303 if (ch > 127) {
9304 int decimal = Py_UNICODE_TODECIMAL(ch);
9305 if (decimal >= 0)
9306 ch = '0' + decimal;
9307 }
9308 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009310 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009311}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009312/* --- Decimal Encoder ---------------------------------------------------- */
9313
Alexander Belopolsky40018472011-02-26 01:02:56 +00009314int
9315PyUnicode_EncodeDecimal(Py_UNICODE *s,
9316 Py_ssize_t length,
9317 char *output,
9318 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009319{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009320 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009321 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009322 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009323 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009324
9325 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 PyErr_BadArgument();
9327 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009328 }
9329
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009330 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009331 if (unicode == NULL)
9332 return -1;
9333
Victor Stinner42bf7752011-11-21 22:52:58 +01009334 kind = PyUnicode_KIND(unicode);
9335 data = PyUnicode_DATA(unicode);
9336
Victor Stinnerb84d7232011-11-22 01:50:07 +01009337 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009338 PyObject *exc;
9339 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009340 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009341 Py_ssize_t startpos;
9342
9343 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009344
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009346 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009347 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009348 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009349 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009350 decimal = Py_UNICODE_TODECIMAL(ch);
9351 if (decimal >= 0) {
9352 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009353 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 continue;
9355 }
9356 if (0 < ch && ch < 256) {
9357 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009358 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 continue;
9360 }
Victor Stinner6345be92011-11-25 20:09:01 +01009361
Victor Stinner42bf7752011-11-21 22:52:58 +01009362 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009363 exc = NULL;
9364 raise_encode_exception(&exc, "decimal", unicode,
9365 startpos, startpos+1,
9366 "invalid decimal Unicode string");
9367 Py_XDECREF(exc);
9368 Py_DECREF(unicode);
9369 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009370 }
9371 /* 0-terminate the output string */
9372 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009373 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009374 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009375}
9376
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377/* --- Helpers ------------------------------------------------------------ */
9378
/* Helper macro to fix up start/end slice values in place for a sequence of
   length len:
     - end larger than len is clamped to len,
     - a negative end or start is taken relative to len and then clamped
       to 0 if it is still negative.
   start and end must be modifiable lvalues; every argument may be
   evaluated more than once.  The body is wrapped in do { } while (0) so the
   macro behaves as a single statement (safe in an unbraced if/else); call
   it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009395any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009397 Py_ssize_t end,
9398 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009400 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009401 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 Py_ssize_t len1, len2, result;
9403
9404 kind1 = PyUnicode_KIND(s1);
9405 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009406 if (kind1 < kind2)
9407 return -1;
9408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 len1 = PyUnicode_GET_LENGTH(s1);
9410 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411 ADJUST_INDICES(start, end, len1);
9412 if (end - start < len2)
9413 return -1;
9414
9415 buf1 = PyUnicode_DATA(s1);
9416 buf2 = PyUnicode_DATA(s2);
9417 if (len2 == 1) {
9418 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9419 result = findchar((const char *)buf1 + kind1*start,
9420 kind1, end - start, ch, direction);
9421 if (result == -1)
9422 return -1;
9423 else
9424 return start + result;
9425 }
9426
9427 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009428 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429 if (!buf2)
9430 return -2;
9431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432
Victor Stinner794d5672011-10-10 03:21:36 +02009433 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009434 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009435 case PyUnicode_1BYTE_KIND:
9436 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9437 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9438 else
9439 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9440 break;
9441 case PyUnicode_2BYTE_KIND:
9442 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9443 break;
9444 case PyUnicode_4BYTE_KIND:
9445 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9446 break;
9447 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009448 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009449 }
9450 }
9451 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009452 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009453 case PyUnicode_1BYTE_KIND:
9454 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9455 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9456 else
9457 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9458 break;
9459 case PyUnicode_2BYTE_KIND:
9460 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9461 break;
9462 case PyUnicode_4BYTE_KIND:
9463 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9464 break;
9465 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009466 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 }
9469
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009470 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009471 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009472 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473
9474 return result;
9475}
9476
Victor Stinner59423e32018-11-26 13:40:01 +01009477/* _PyUnicode_InsertThousandsGrouping() helper functions */
9478#include "stringlib/localeutil.h"
9479
9480/**
9481 * InsertThousandsGrouping:
9482 * @writer: Unicode writer.
9483 * @n_buffer: Number of characters in @buffer.
9484 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9485 * @d_pos: Start of digits string.
9486 * @n_digits: The number of digits in the string, in which we want
9487 * to put the grouping chars.
9488 * @min_width: The minimum width of the digits in the output string.
9489 * Output will be zero-padded on the left to fill.
9490 * @grouping: see definition in localeconv().
9491 * @thousands_sep: see definition in localeconv().
9492 *
9493 * There are 2 modes: counting and filling. If @writer is NULL,
9494 * we are in counting mode, else filling mode.
9495 * If counting, the required buffer size is returned.
9496 * If filling, we know the buffer will be large enough, so we don't
9497 * need to pass in the buffer size.
9498 * Inserts thousand grouping characters (as defined by grouping and
9499 * thousands_sep) into @writer.
9500 *
9501 * Return value: -1 on error, number of characters otherwise.
9502 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009504_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009505 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009506 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009507 PyObject *digits,
9508 Py_ssize_t d_pos,
9509 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009510 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009511 const char *grouping,
9512 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009513 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514{
Xtreak3f7983a2019-01-07 20:39:14 +05309515 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009516 if (writer) {
9517 assert(digits != NULL);
9518 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009519 }
9520 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009521 assert(digits == NULL);
9522 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009523 }
Victor Stinner59423e32018-11-26 13:40:01 +01009524 assert(0 <= d_pos);
9525 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009526 assert(grouping != NULL);
9527
9528 if (digits != NULL) {
9529 if (PyUnicode_READY(digits) == -1) {
9530 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009531 }
Victor Stinner59423e32018-11-26 13:40:01 +01009532 }
9533 if (PyUnicode_READY(thousands_sep) == -1) {
9534 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009535 }
9536
Victor Stinner59423e32018-11-26 13:40:01 +01009537 Py_ssize_t count = 0;
9538 Py_ssize_t n_zeros;
9539 int loop_broken = 0;
9540 int use_separator = 0; /* First time through, don't append the
9541 separator. They only go between
9542 groups. */
9543 Py_ssize_t buffer_pos;
9544 Py_ssize_t digits_pos;
9545 Py_ssize_t len;
9546 Py_ssize_t n_chars;
9547 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9548 be looked at */
9549 /* A generator that returns all of the grouping widths, until it
9550 returns 0. */
9551 GroupGenerator groupgen;
9552 GroupGenerator_init(&groupgen, grouping);
9553 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9554
9555 /* if digits are not grouped, thousands separator
9556 should be an empty string */
9557 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9558
9559 digits_pos = d_pos + n_digits;
9560 if (writer) {
9561 buffer_pos = writer->pos + n_buffer;
9562 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9563 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 }
Victor Stinner59423e32018-11-26 13:40:01 +01009565 else {
9566 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009567 }
Victor Stinner59423e32018-11-26 13:40:01 +01009568
9569 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009570 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009571 }
Victor Stinner59423e32018-11-26 13:40:01 +01009572
9573 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9574 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9575 n_zeros = Py_MAX(0, len - remaining);
9576 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9577
9578 /* Use n_zero zero's and n_chars chars */
9579
9580 /* Count only, don't do anything. */
9581 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9582
9583 /* Copy into the writer. */
9584 InsertThousandsGrouping_fill(writer, &buffer_pos,
9585 digits, &digits_pos,
9586 n_chars, n_zeros,
9587 use_separator ? thousands_sep : NULL,
9588 thousands_sep_len, maxchar);
9589
9590 /* Use a separator next time. */
9591 use_separator = 1;
9592
9593 remaining -= n_chars;
9594 min_width -= len;
9595
9596 if (remaining <= 0 && min_width <= 0) {
9597 loop_broken = 1;
9598 break;
9599 }
9600 min_width -= thousands_sep_len;
9601 }
9602 if (!loop_broken) {
9603 /* We left the loop without using a break statement. */
9604
9605 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9606 n_zeros = Py_MAX(0, len - remaining);
9607 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9608
9609 /* Use n_zero zero's and n_chars chars */
9610 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9611
9612 /* Copy into the writer. */
9613 InsertThousandsGrouping_fill(writer, &buffer_pos,
9614 digits, &digits_pos,
9615 n_chars, n_zeros,
9616 use_separator ? thousands_sep : NULL,
9617 thousands_sep_len, maxchar);
9618 }
9619 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620}
9621
9622
Alexander Belopolsky40018472011-02-26 01:02:56 +00009623Py_ssize_t
9624PyUnicode_Count(PyObject *str,
9625 PyObject *substr,
9626 Py_ssize_t start,
9627 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009629 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009630 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009631 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009633
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009634 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009635 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009636
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009637 kind1 = PyUnicode_KIND(str);
9638 kind2 = PyUnicode_KIND(substr);
9639 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009640 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009641
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009642 len1 = PyUnicode_GET_LENGTH(str);
9643 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009645 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009646 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009647
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009648 buf1 = PyUnicode_DATA(str);
9649 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009650 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009651 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009652 if (!buf2)
9653 goto onError;
9654 }
9655
9656 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009658 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009659 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009660 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009661 buf2, len2, PY_SSIZE_T_MAX
9662 );
9663 else
9664 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009665 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009666 buf2, len2, PY_SSIZE_T_MAX
9667 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 break;
9669 case PyUnicode_2BYTE_KIND:
9670 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009671 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 buf2, len2, PY_SSIZE_T_MAX
9673 );
9674 break;
9675 case PyUnicode_4BYTE_KIND:
9676 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009677 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 buf2, len2, PY_SSIZE_T_MAX
9679 );
9680 break;
9681 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009682 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009684
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009685 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009686 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009687 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009688
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009691 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9692 if (kind2 != kind1)
9693 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695}
9696
Alexander Belopolsky40018472011-02-26 01:02:56 +00009697Py_ssize_t
9698PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009699 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009700 Py_ssize_t start,
9701 Py_ssize_t end,
9702 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009704 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009705 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009706
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009707 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708}
9709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710Py_ssize_t
9711PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9712 Py_ssize_t start, Py_ssize_t end,
9713 int direction)
9714{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009716 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 if (PyUnicode_READY(str) == -1)
9718 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009719 len = PyUnicode_GET_LENGTH(str);
9720 ADJUST_INDICES(start, end, len);
9721 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009722 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009724 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9725 kind, end-start, ch, direction);
9726 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009728 else
9729 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730}
9731
Alexander Belopolsky40018472011-02-26 01:02:56 +00009732static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009733tailmatch(PyObject *self,
9734 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009735 Py_ssize_t start,
9736 Py_ssize_t end,
9737 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 int kind_self;
9740 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009741 const void *data_self;
9742 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743 Py_ssize_t offset;
9744 Py_ssize_t i;
9745 Py_ssize_t end_sub;
9746
9747 if (PyUnicode_READY(self) == -1 ||
9748 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009749 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9752 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009754 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009756 if (PyUnicode_GET_LENGTH(substring) == 0)
9757 return 1;
9758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009759 kind_self = PyUnicode_KIND(self);
9760 data_self = PyUnicode_DATA(self);
9761 kind_sub = PyUnicode_KIND(substring);
9762 data_sub = PyUnicode_DATA(substring);
9763 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9764
9765 if (direction > 0)
9766 offset = end;
9767 else
9768 offset = start;
9769
9770 if (PyUnicode_READ(kind_self, data_self, offset) ==
9771 PyUnicode_READ(kind_sub, data_sub, 0) &&
9772 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9773 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9774 /* If both are of the same kind, memcmp is sufficient */
9775 if (kind_self == kind_sub) {
9776 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009777 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 data_sub,
9779 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009780 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009782 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 else {
9784 /* We do not need to compare 0 and len(substring)-1 because
9785 the if statement above ensured already that they are equal
9786 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 for (i = 1; i < end_sub; ++i) {
9788 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9789 PyUnicode_READ(kind_sub, data_sub, i))
9790 return 0;
9791 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009792 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794 }
9795
9796 return 0;
9797}
9798
Alexander Belopolsky40018472011-02-26 01:02:56 +00009799Py_ssize_t
9800PyUnicode_Tailmatch(PyObject *str,
9801 PyObject *substr,
9802 Py_ssize_t start,
9803 Py_ssize_t end,
9804 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009806 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009807 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009808
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009809 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810}
9811
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009812static PyObject *
9813ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009815 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009816 const char *data = PyUnicode_DATA(self);
9817 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009818 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009819
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009820 res = PyUnicode_New(len, 127);
9821 if (res == NULL)
9822 return NULL;
9823 resdata = PyUnicode_DATA(res);
9824 if (lower)
9825 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009827 _Py_bytes_upper(resdata, data, len);
9828 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829}
9830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009832handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009834 Py_ssize_t j;
9835 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009836 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009837 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009838
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009839 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9840
9841 where ! is a negation and \p{xxx} is a character with property xxx.
9842 */
9843 for (j = i - 1; j >= 0; j--) {
9844 c = PyUnicode_READ(kind, data, j);
9845 if (!_PyUnicode_IsCaseIgnorable(c))
9846 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009848 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9849 if (final_sigma) {
9850 for (j = i + 1; j < length; j++) {
9851 c = PyUnicode_READ(kind, data, j);
9852 if (!_PyUnicode_IsCaseIgnorable(c))
9853 break;
9854 }
9855 final_sigma = j == length || !_PyUnicode_IsCased(c);
9856 }
9857 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858}
9859
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009860static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009861lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009862 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009864 /* Obscure special case. */
9865 if (c == 0x3A3) {
9866 mapped[0] = handle_capital_sigma(kind, data, length, i);
9867 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009869 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870}
9871
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009872static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009873do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009875 Py_ssize_t i, k = 0;
9876 int n_res, j;
9877 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009878
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009879 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009880 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009881 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009882 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009883 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009885 for (i = 1; i < length; i++) {
9886 c = PyUnicode_READ(kind, data, i);
9887 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9888 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009889 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009890 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009891 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009892 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009893 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894}
9895
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009896static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009897do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009898 Py_ssize_t i, k = 0;
9899
9900 for (i = 0; i < length; i++) {
9901 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9902 int n_res, j;
9903 if (Py_UNICODE_ISUPPER(c)) {
9904 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9905 }
9906 else if (Py_UNICODE_ISLOWER(c)) {
9907 n_res = _PyUnicode_ToUpperFull(c, mapped);
9908 }
9909 else {
9910 n_res = 1;
9911 mapped[0] = c;
9912 }
9913 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009914 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009915 res[k++] = mapped[j];
9916 }
9917 }
9918 return k;
9919}
9920
9921static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009922do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009923 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009925 Py_ssize_t i, k = 0;
9926
9927 for (i = 0; i < length; i++) {
9928 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9929 int n_res, j;
9930 if (lower)
9931 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9932 else
9933 n_res = _PyUnicode_ToUpperFull(c, mapped);
9934 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009935 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009936 res[k++] = mapped[j];
9937 }
9938 }
9939 return k;
9940}
9941
9942static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009943do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009944{
9945 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9946}
9947
9948static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009949do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009950{
9951 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9952}
9953
Benjamin Petersone51757f2012-01-12 21:10:29 -05009954static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009955do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -05009956{
9957 Py_ssize_t i, k = 0;
9958
9959 for (i = 0; i < length; i++) {
9960 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9961 Py_UCS4 mapped[3];
9962 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9963 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009964 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009965 res[k++] = mapped[j];
9966 }
9967 }
9968 return k;
9969}
9970
9971static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009972do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -05009973{
9974 Py_ssize_t i, k = 0;
9975 int previous_is_cased;
9976
9977 previous_is_cased = 0;
9978 for (i = 0; i < length; i++) {
9979 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9980 Py_UCS4 mapped[3];
9981 int n_res, j;
9982
9983 if (previous_is_cased)
9984 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9985 else
9986 n_res = _PyUnicode_ToTitleFull(c, mapped);
9987
9988 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009989 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009990 res[k++] = mapped[j];
9991 }
9992
9993 previous_is_cased = _PyUnicode_IsCased(c);
9994 }
9995 return k;
9996}
9997
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009998static PyObject *
9999case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010000 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010001{
10002 PyObject *res = NULL;
10003 Py_ssize_t length, newlength = 0;
10004 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010005 const void *data;
10006 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010007 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10008
Benjamin Petersoneea48462012-01-16 14:28:50 -050010009 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010010
10011 kind = PyUnicode_KIND(self);
10012 data = PyUnicode_DATA(self);
10013 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010014 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010015 PyErr_SetString(PyExc_OverflowError, "string is too long");
10016 return NULL;
10017 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010018 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010019 if (tmp == NULL)
10020 return PyErr_NoMemory();
10021 newlength = perform(kind, data, length, tmp, &maxchar);
10022 res = PyUnicode_New(newlength, maxchar);
10023 if (res == NULL)
10024 goto leave;
10025 tmpend = tmp + newlength;
10026 outdata = PyUnicode_DATA(res);
10027 outkind = PyUnicode_KIND(res);
10028 switch (outkind) {
10029 case PyUnicode_1BYTE_KIND:
10030 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10031 break;
10032 case PyUnicode_2BYTE_KIND:
10033 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10034 break;
10035 case PyUnicode_4BYTE_KIND:
10036 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10037 break;
10038 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010039 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010040 }
10041 leave:
10042 PyMem_FREE(tmp);
10043 return res;
10044}
10045
Tim Peters8ce9f162004-08-27 01:49:32 +000010046PyObject *
10047PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010048{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010049 PyObject *res;
10050 PyObject *fseq;
10051 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010052 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010054 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010055 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010056 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010057 }
10058
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010059 /* NOTE: the following code can't call back into Python code,
10060 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010061 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010062
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010063 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010064 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010065 res = _PyUnicode_JoinArray(separator, items, seqlen);
10066 Py_DECREF(fseq);
10067 return res;
10068}
10069
10070PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010071_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010072{
10073 PyObject *res = NULL; /* the result */
10074 PyObject *sep = NULL;
10075 Py_ssize_t seplen;
10076 PyObject *item;
10077 Py_ssize_t sz, i, res_offset;
10078 Py_UCS4 maxchar;
10079 Py_UCS4 item_maxchar;
10080 int use_memcpy;
10081 unsigned char *res_data = NULL, *sep_data = NULL;
10082 PyObject *last_obj;
10083 unsigned int kind = 0;
10084
Tim Peters05eba1f2004-08-27 21:32:02 +000010085 /* If empty sequence, return u"". */
10086 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010087 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010088 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010089
Tim Peters05eba1f2004-08-27 21:32:02 +000010090 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010091 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010092 if (seqlen == 1) {
10093 if (PyUnicode_CheckExact(items[0])) {
10094 res = items[0];
10095 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010096 return res;
10097 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010098 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010099 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010100 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010101 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010102 /* Set up sep and seplen */
10103 if (separator == NULL) {
10104 /* fall back to a blank space separator */
10105 sep = PyUnicode_FromOrdinal(' ');
10106 if (!sep)
10107 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010108 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010109 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010110 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010111 else {
10112 if (!PyUnicode_Check(separator)) {
10113 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010114 "separator: expected str instance,"
10115 " %.80s found",
10116 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010117 goto onError;
10118 }
10119 if (PyUnicode_READY(separator))
10120 goto onError;
10121 sep = separator;
10122 seplen = PyUnicode_GET_LENGTH(separator);
10123 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10124 /* inc refcount to keep this code path symmetric with the
10125 above case of a blank separator */
10126 Py_INCREF(sep);
10127 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010128 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010129 }
10130
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010131 /* There are at least two things to join, or else we have a subclass
10132 * of str in the sequence.
10133 * Do a pre-pass to figure out the total amount of space we'll
10134 * need (sz), and see whether all argument are strings.
10135 */
10136 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010137#ifdef Py_DEBUG
10138 use_memcpy = 0;
10139#else
10140 use_memcpy = 1;
10141#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010142 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010143 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010144 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010145 if (!PyUnicode_Check(item)) {
10146 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010147 "sequence item %zd: expected str instance,"
10148 " %.80s found",
10149 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010150 goto onError;
10151 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 if (PyUnicode_READY(item) == -1)
10153 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010154 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010156 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010157 if (i != 0) {
10158 add_sz += seplen;
10159 }
10160 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010161 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010162 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010163 goto onError;
10164 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010165 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010166 if (use_memcpy && last_obj != NULL) {
10167 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10168 use_memcpy = 0;
10169 }
10170 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010171 }
Tim Petersced69f82003-09-16 20:30:58 +000010172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010174 if (res == NULL)
10175 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010176
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010177 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010178#ifdef Py_DEBUG
10179 use_memcpy = 0;
10180#else
10181 if (use_memcpy) {
10182 res_data = PyUnicode_1BYTE_DATA(res);
10183 kind = PyUnicode_KIND(res);
10184 if (seplen != 0)
10185 sep_data = PyUnicode_1BYTE_DATA(sep);
10186 }
10187#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010188 if (use_memcpy) {
10189 for (i = 0; i < seqlen; ++i) {
10190 Py_ssize_t itemlen;
10191 item = items[i];
10192
10193 /* Copy item, and maybe the separator. */
10194 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010195 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010196 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010197 kind * seplen);
10198 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010199 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010200
10201 itemlen = PyUnicode_GET_LENGTH(item);
10202 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010203 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010204 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010205 kind * itemlen);
10206 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010207 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010208 }
10209 assert(res_data == PyUnicode_1BYTE_DATA(res)
10210 + kind * PyUnicode_GET_LENGTH(res));
10211 }
10212 else {
10213 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10214 Py_ssize_t itemlen;
10215 item = items[i];
10216
10217 /* Copy item, and maybe the separator. */
10218 if (i && seplen != 0) {
10219 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10220 res_offset += seplen;
10221 }
10222
10223 itemlen = PyUnicode_GET_LENGTH(item);
10224 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010225 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010226 res_offset += itemlen;
10227 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010228 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010229 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010230 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010233 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235
Benjamin Peterson29060642009-01-31 22:14:21 +000010236 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010238 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239 return NULL;
10240}
10241
Victor Stinnerd3f08822012-05-29 12:57:52 +020010242void
10243_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10244 Py_UCS4 fill_char)
10245{
10246 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010247 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010248 assert(PyUnicode_IS_READY(unicode));
10249 assert(unicode_modifiable(unicode));
10250 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10251 assert(start >= 0);
10252 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010253 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010254}
10255
Victor Stinner3fe55312012-01-04 00:33:50 +010010256Py_ssize_t
10257PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10258 Py_UCS4 fill_char)
10259{
10260 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010261
10262 if (!PyUnicode_Check(unicode)) {
10263 PyErr_BadInternalCall();
10264 return -1;
10265 }
10266 if (PyUnicode_READY(unicode) == -1)
10267 return -1;
10268 if (unicode_check_modifiable(unicode))
10269 return -1;
10270
Victor Stinnerd3f08822012-05-29 12:57:52 +020010271 if (start < 0) {
10272 PyErr_SetString(PyExc_IndexError, "string index out of range");
10273 return -1;
10274 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010275 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10276 PyErr_SetString(PyExc_ValueError,
10277 "fill character is bigger than "
10278 "the string maximum character");
10279 return -1;
10280 }
10281
10282 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10283 length = Py_MIN(maxlen, length);
10284 if (length <= 0)
10285 return 0;
10286
Victor Stinnerd3f08822012-05-29 12:57:52 +020010287 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010288 return length;
10289}
10290
Victor Stinner9310abb2011-10-05 00:59:23 +020010291static PyObject *
10292pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010293 Py_ssize_t left,
10294 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 PyObject *u;
10298 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010299 int kind;
10300 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301
10302 if (left < 0)
10303 left = 0;
10304 if (right < 0)
10305 right = 0;
10306
Victor Stinnerc4b49542011-12-11 22:44:26 +010010307 if (left == 0 && right == 0)
10308 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10311 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010312 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10313 return NULL;
10314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010316 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010318 if (!u)
10319 return NULL;
10320
10321 kind = PyUnicode_KIND(u);
10322 data = PyUnicode_DATA(u);
10323 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010324 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010325 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010326 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010327 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010328 assert(_PyUnicode_CheckConsistency(u, 1));
10329 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330}
10331
Alexander Belopolsky40018472011-02-26 01:02:56 +000010332PyObject *
10333PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010337 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339
Benjamin Petersonead6b532011-12-20 17:23:42 -060010340 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010342 if (PyUnicode_IS_ASCII(string))
10343 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010344 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010345 PyUnicode_GET_LENGTH(string), keepends);
10346 else
10347 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 break;
10351 case PyUnicode_2BYTE_KIND:
10352 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010353 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 PyUnicode_GET_LENGTH(string), keepends);
10355 break;
10356 case PyUnicode_4BYTE_KIND:
10357 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 PyUnicode_GET_LENGTH(string), keepends);
10360 break;
10361 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010362 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365}
10366
Alexander Belopolsky40018472011-02-26 01:02:56 +000010367static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010368split(PyObject *self,
10369 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010370 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010372 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010373 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 Py_ssize_t len1, len2;
10375 PyObject* out;
10376
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010378 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 if (PyUnicode_READY(self) == -1)
10381 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010384 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010386 if (PyUnicode_IS_ASCII(self))
10387 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010388 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010389 PyUnicode_GET_LENGTH(self), maxcount
10390 );
10391 else
10392 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010393 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010394 PyUnicode_GET_LENGTH(self), maxcount
10395 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 case PyUnicode_2BYTE_KIND:
10397 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010398 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 PyUnicode_GET_LENGTH(self), maxcount
10400 );
10401 case PyUnicode_4BYTE_KIND:
10402 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010403 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 PyUnicode_GET_LENGTH(self), maxcount
10405 );
10406 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010407 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 }
10409
10410 if (PyUnicode_READY(substring) == -1)
10411 return NULL;
10412
10413 kind1 = PyUnicode_KIND(self);
10414 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 len1 = PyUnicode_GET_LENGTH(self);
10416 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010417 if (kind1 < kind2 || len1 < len2) {
10418 out = PyList_New(1);
10419 if (out == NULL)
10420 return NULL;
10421 Py_INCREF(self);
10422 PyList_SET_ITEM(out, 0, self);
10423 return out;
10424 }
10425 buf1 = PyUnicode_DATA(self);
10426 buf2 = PyUnicode_DATA(substring);
10427 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010428 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010429 if (!buf2)
10430 return NULL;
10431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010433 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010435 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10436 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010437 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010438 else
10439 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010440 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 break;
10442 case PyUnicode_2BYTE_KIND:
10443 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010444 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 break;
10446 case PyUnicode_4BYTE_KIND:
10447 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010448 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 break;
10450 default:
10451 out = NULL;
10452 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010453 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010454 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010455 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457}
10458
Alexander Belopolsky40018472011-02-26 01:02:56 +000010459static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010460rsplit(PyObject *self,
10461 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010462 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010463{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010464 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010465 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 Py_ssize_t len1, len2;
10467 PyObject* out;
10468
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010469 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010470 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 if (PyUnicode_READY(self) == -1)
10473 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010476 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010478 if (PyUnicode_IS_ASCII(self))
10479 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010480 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010481 PyUnicode_GET_LENGTH(self), maxcount
10482 );
10483 else
10484 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010485 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010486 PyUnicode_GET_LENGTH(self), maxcount
10487 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 case PyUnicode_2BYTE_KIND:
10489 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010490 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 PyUnicode_GET_LENGTH(self), maxcount
10492 );
10493 case PyUnicode_4BYTE_KIND:
10494 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010495 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 PyUnicode_GET_LENGTH(self), maxcount
10497 );
10498 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010499 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 }
10501
10502 if (PyUnicode_READY(substring) == -1)
10503 return NULL;
10504
10505 kind1 = PyUnicode_KIND(self);
10506 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 len1 = PyUnicode_GET_LENGTH(self);
10508 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010509 if (kind1 < kind2 || len1 < len2) {
10510 out = PyList_New(1);
10511 if (out == NULL)
10512 return NULL;
10513 Py_INCREF(self);
10514 PyList_SET_ITEM(out, 0, self);
10515 return out;
10516 }
10517 buf1 = PyUnicode_DATA(self);
10518 buf2 = PyUnicode_DATA(substring);
10519 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010520 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010521 if (!buf2)
10522 return NULL;
10523 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010525 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010527 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10528 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010529 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010530 else
10531 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010532 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 break;
10534 case PyUnicode_2BYTE_KIND:
10535 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010536 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 break;
10538 case PyUnicode_4BYTE_KIND:
10539 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010540 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 break;
10542 default:
10543 out = NULL;
10544 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010545 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010546 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010547 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 return out;
10549}
10550
10551static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010552anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10553 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010555 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010557 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10558 return asciilib_find(buf1, len1, buf2, len2, offset);
10559 else
10560 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 case PyUnicode_2BYTE_KIND:
10562 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10563 case PyUnicode_4BYTE_KIND:
10564 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10565 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010566 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567}
10568
10569static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010570anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10571 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010573 switch (kind) {
10574 case PyUnicode_1BYTE_KIND:
10575 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10576 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10577 else
10578 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10579 case PyUnicode_2BYTE_KIND:
10580 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10581 case PyUnicode_4BYTE_KIND:
10582 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10583 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010584 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010585}
10586
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010587static void
10588replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10589 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10590{
10591 int kind = PyUnicode_KIND(u);
10592 void *data = PyUnicode_DATA(u);
10593 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10594 if (kind == PyUnicode_1BYTE_KIND) {
10595 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10596 (Py_UCS1 *)data + len,
10597 u1, u2, maxcount);
10598 }
10599 else if (kind == PyUnicode_2BYTE_KIND) {
10600 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10601 (Py_UCS2 *)data + len,
10602 u1, u2, maxcount);
10603 }
10604 else {
10605 assert(kind == PyUnicode_4BYTE_KIND);
10606 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10607 (Py_UCS4 *)data + len,
10608 u1, u2, maxcount);
10609 }
10610}
10611
Alexander Belopolsky40018472011-02-26 01:02:56 +000010612static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613replace(PyObject *self, PyObject *str1,
10614 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010617 const char *sbuf = PyUnicode_DATA(self);
10618 const void *buf1 = PyUnicode_DATA(str1);
10619 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 int srelease = 0, release1 = 0, release2 = 0;
10621 int skind = PyUnicode_KIND(self);
10622 int kind1 = PyUnicode_KIND(str1);
10623 int kind2 = PyUnicode_KIND(str2);
10624 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10625 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10626 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010627 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010628 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010630 if (slen < len1)
10631 goto nothing;
10632
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010634 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010635 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010636 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637
Victor Stinner59de0ee2011-10-07 10:01:28 +020010638 if (str1 == str2)
10639 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640
Victor Stinner49a0a212011-10-12 23:46:10 +020010641 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010642 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10643 if (maxchar < maxchar_str1)
10644 /* substring too wide to be present */
10645 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10647 /* Replacing str1 with str2 may cause a maxchar reduction in the
10648 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010649 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010650 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010655 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010658 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010659 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010660
Victor Stinner69ed0f42013-04-09 21:48:24 +020010661 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010662 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010663 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010665 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010669
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010670 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10671 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010672 }
10673 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 int rkind = skind;
10675 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010676 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 if (kind1 < rkind) {
10679 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010680 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 if (!buf1) goto error;
10682 release1 = 1;
10683 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010684 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010685 if (i < 0)
10686 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 if (rkind > kind2) {
10688 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010689 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (!buf2) goto error;
10691 release2 = 1;
10692 }
10693 else if (rkind < kind2) {
10694 /* widen self and buf1 */
10695 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010696 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010697 assert(buf1 != PyUnicode_DATA(str1));
10698 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010699 buf1 = PyUnicode_DATA(str1);
10700 release1 = 0;
10701 }
10702 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 if (!sbuf) goto error;
10704 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010705 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010706 if (!buf1) goto error;
10707 release1 = 1;
10708 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010709 u = PyUnicode_New(slen, maxchar);
10710 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010712 assert(PyUnicode_KIND(u) == rkind);
10713 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010714
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010715 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010716 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010717 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010719 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010721
10722 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010723 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010724 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010725 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010726 if (i == -1)
10727 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010728 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010730 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010734 }
10735 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010737 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 int rkind = skind;
10739 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010742 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010743 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (!buf1) goto error;
10745 release1 = 1;
10746 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010747 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010748 if (n == 0)
10749 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010751 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010752 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 if (!buf2) goto error;
10754 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010757 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010759 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 if (!sbuf) goto error;
10761 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010762 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010763 assert(buf1 != PyUnicode_DATA(str1));
10764 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010765 buf1 = PyUnicode_DATA(str1);
10766 release1 = 0;
10767 }
10768 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 if (!buf1) goto error;
10770 release1 = 1;
10771 }
10772 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10773 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010774 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 PyErr_SetString(PyExc_OverflowError,
10776 "replace string is too long");
10777 goto error;
10778 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010779 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010780 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010781 _Py_INCREF_UNICODE_EMPTY();
10782 if (!unicode_empty)
10783 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010784 u = unicode_empty;
10785 goto done;
10786 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010787 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 PyErr_SetString(PyExc_OverflowError,
10789 "replace string is too long");
10790 goto error;
10791 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010792 u = PyUnicode_New(new_size, maxchar);
10793 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010795 assert(PyUnicode_KIND(u) == rkind);
10796 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 ires = i = 0;
10798 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010799 while (n-- > 0) {
10800 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010801 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010802 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010803 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010804 if (j == -1)
10805 break;
10806 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010807 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010808 memcpy(res + rkind * ires,
10809 sbuf + rkind * i,
10810 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010812 }
10813 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010815 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010817 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010823 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010824 memcpy(res + rkind * ires,
10825 sbuf + rkind * i,
10826 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010827 }
10828 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010829 /* interleave */
10830 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010831 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010833 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010835 if (--n <= 0)
10836 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010837 memcpy(res + rkind * ires,
10838 sbuf + rkind * i,
10839 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 ires++;
10841 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010842 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010843 memcpy(res + rkind * ires,
10844 sbuf + rkind * i,
10845 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010846 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010847 }
10848
10849 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010850 unicode_adjust_maxchar(&u);
10851 if (u == NULL)
10852 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010854
10855 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010856 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10857 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10858 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010860 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010862 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010864 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010865 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010867
Benjamin Peterson29060642009-01-31 22:14:21 +000010868 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010869 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010870 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10871 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10872 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010874 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010876 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010878 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010879 return unicode_result_unchanged(self);
10880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010882 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10883 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10884 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10885 if (srelease)
10886 PyMem_FREE((void *)sbuf);
10887 if (release1)
10888 PyMem_FREE((void *)buf1);
10889 if (release2)
10890 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892}
10893
10894/* --- Unicode Object Methods --------------------------------------------- */
10895
INADA Naoki3ae20562017-01-16 20:41:20 +090010896/*[clinic input]
10897str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898
INADA Naoki3ae20562017-01-16 20:41:20 +090010899Return a version of the string where each word is titlecased.
10900
10901More specifically, words start with uppercased characters and all remaining
10902cased characters have lower case.
10903[clinic start generated code]*/
10904
10905static PyObject *
10906unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010907/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010909 if (PyUnicode_READY(self) == -1)
10910 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010911 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912}
10913
INADA Naoki3ae20562017-01-16 20:41:20 +090010914/*[clinic input]
10915str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916
INADA Naoki3ae20562017-01-16 20:41:20 +090010917Return a capitalized version of the string.
10918
10919More specifically, make the first character have upper case and the rest lower
10920case.
10921[clinic start generated code]*/
10922
10923static PyObject *
10924unicode_capitalize_impl(PyObject *self)
10925/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010927 if (PyUnicode_READY(self) == -1)
10928 return NULL;
10929 if (PyUnicode_GET_LENGTH(self) == 0)
10930 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010931 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932}
10933
INADA Naoki3ae20562017-01-16 20:41:20 +090010934/*[clinic input]
10935str.casefold as unicode_casefold
10936
10937Return a version of the string suitable for caseless comparisons.
10938[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010939
10940static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010941unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010942/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010943{
10944 if (PyUnicode_READY(self) == -1)
10945 return NULL;
10946 if (PyUnicode_IS_ASCII(self))
10947 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010948 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010949}
10950
10951
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010952/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010953
10954static int
10955convert_uc(PyObject *obj, void *addr)
10956{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010958
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010959 if (!PyUnicode_Check(obj)) {
10960 PyErr_Format(PyExc_TypeError,
10961 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010962 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010963 return 0;
10964 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010965 if (PyUnicode_READY(obj) < 0)
10966 return 0;
10967 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010968 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010970 return 0;
10971 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010972 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010973 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010974}
10975
INADA Naoki3ae20562017-01-16 20:41:20 +090010976/*[clinic input]
10977str.center as unicode_center
10978
10979 width: Py_ssize_t
10980 fillchar: Py_UCS4 = ' '
10981 /
10982
10983Return a centered string of length width.
10984
10985Padding is done using the specified fill character (default is a space).
10986[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987
10988static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010989unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10990/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010992 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993
Benjamin Petersonbac79492012-01-14 13:34:47 -050010994 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 return NULL;
10996
Victor Stinnerc4b49542011-12-11 22:44:26 +010010997 if (PyUnicode_GET_LENGTH(self) >= width)
10998 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999
Victor Stinnerc4b49542011-12-11 22:44:26 +010011000 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 left = marg / 2 + (marg & width & 1);
11002
Victor Stinner9310abb2011-10-05 00:59:23 +020011003 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004}
11005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006/* This function assumes that str1 and str2 are readied by the caller. */
11007
Marc-André Lemburge5034372000-08-08 08:04:29 +000011008static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011009unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011010{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011011#define COMPARE(TYPE1, TYPE2) \
11012 do { \
11013 TYPE1* p1 = (TYPE1 *)data1; \
11014 TYPE2* p2 = (TYPE2 *)data2; \
11015 TYPE1* end = p1 + len; \
11016 Py_UCS4 c1, c2; \
11017 for (; p1 != end; p1++, p2++) { \
11018 c1 = *p1; \
11019 c2 = *p2; \
11020 if (c1 != c2) \
11021 return (c1 < c2) ? -1 : 1; \
11022 } \
11023 } \
11024 while (0)
11025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011027 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011028 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 kind1 = PyUnicode_KIND(str1);
11031 kind2 = PyUnicode_KIND(str2);
11032 data1 = PyUnicode_DATA(str1);
11033 data2 = PyUnicode_DATA(str2);
11034 len1 = PyUnicode_GET_LENGTH(str1);
11035 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011036 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011037
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011038 switch(kind1) {
11039 case PyUnicode_1BYTE_KIND:
11040 {
11041 switch(kind2) {
11042 case PyUnicode_1BYTE_KIND:
11043 {
11044 int cmp = memcmp(data1, data2, len);
11045 /* normalize result of memcmp() into the range [-1; 1] */
11046 if (cmp < 0)
11047 return -1;
11048 if (cmp > 0)
11049 return 1;
11050 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011051 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011052 case PyUnicode_2BYTE_KIND:
11053 COMPARE(Py_UCS1, Py_UCS2);
11054 break;
11055 case PyUnicode_4BYTE_KIND:
11056 COMPARE(Py_UCS1, Py_UCS4);
11057 break;
11058 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011059 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011060 }
11061 break;
11062 }
11063 case PyUnicode_2BYTE_KIND:
11064 {
11065 switch(kind2) {
11066 case PyUnicode_1BYTE_KIND:
11067 COMPARE(Py_UCS2, Py_UCS1);
11068 break;
11069 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011070 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011071 COMPARE(Py_UCS2, Py_UCS2);
11072 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011073 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011074 case PyUnicode_4BYTE_KIND:
11075 COMPARE(Py_UCS2, Py_UCS4);
11076 break;
11077 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011078 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011079 }
11080 break;
11081 }
11082 case PyUnicode_4BYTE_KIND:
11083 {
11084 switch(kind2) {
11085 case PyUnicode_1BYTE_KIND:
11086 COMPARE(Py_UCS4, Py_UCS1);
11087 break;
11088 case PyUnicode_2BYTE_KIND:
11089 COMPARE(Py_UCS4, Py_UCS2);
11090 break;
11091 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011092 {
11093#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11094 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11095 /* normalize result of wmemcmp() into the range [-1; 1] */
11096 if (cmp < 0)
11097 return -1;
11098 if (cmp > 0)
11099 return 1;
11100#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011101 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011102#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011103 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011104 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011105 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011106 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011107 }
11108 break;
11109 }
11110 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011111 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011112 }
11113
Victor Stinner770e19e2012-10-04 22:59:45 +020011114 if (len1 == len2)
11115 return 0;
11116 if (len1 < len2)
11117 return -1;
11118 else
11119 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011120
11121#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011122}
11123
Benjamin Peterson621b4302016-09-09 13:54:34 -070011124static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011125unicode_compare_eq(PyObject *str1, PyObject *str2)
11126{
11127 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011128 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011129 Py_ssize_t len;
11130 int cmp;
11131
Victor Stinnere5567ad2012-10-23 02:48:49 +020011132 len = PyUnicode_GET_LENGTH(str1);
11133 if (PyUnicode_GET_LENGTH(str2) != len)
11134 return 0;
11135 kind = PyUnicode_KIND(str1);
11136 if (PyUnicode_KIND(str2) != kind)
11137 return 0;
11138 data1 = PyUnicode_DATA(str1);
11139 data2 = PyUnicode_DATA(str2);
11140
11141 cmp = memcmp(data1, data2, len * kind);
11142 return (cmp == 0);
11143}
11144
11145
Alexander Belopolsky40018472011-02-26 01:02:56 +000011146int
11147PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11150 if (PyUnicode_READY(left) == -1 ||
11151 PyUnicode_READY(right) == -1)
11152 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011153
11154 /* a string is equal to itself */
11155 if (left == right)
11156 return 0;
11157
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011158 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011160 PyErr_Format(PyExc_TypeError,
11161 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011162 Py_TYPE(left)->tp_name,
11163 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164 return -1;
11165}
11166
Martin v. Löwis5b222132007-06-10 09:51:05 +000011167int
11168PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11169{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 Py_ssize_t i;
11171 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011173 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174
Victor Stinner910337b2011-10-03 03:20:16 +020011175 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011176 if (!PyUnicode_IS_READY(uni)) {
11177 const wchar_t *ws = _PyUnicode_WSTR(uni);
11178 /* Compare Unicode string and source character set string */
11179 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11180 if (chr != ustr[i])
11181 return (chr < ustr[i]) ? -1 : 1;
11182 }
11183 /* This check keeps Python strings that end in '\0' from comparing equal
11184 to C strings identical up to that point. */
11185 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11186 return 1; /* uni is longer */
11187 if (ustr[i])
11188 return -1; /* str is longer */
11189 return 0;
11190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011192 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011193 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011194 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011195 size_t len, len2 = strlen(str);
11196 int cmp;
11197
11198 len = Py_MIN(len1, len2);
11199 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011200 if (cmp != 0) {
11201 if (cmp < 0)
11202 return -1;
11203 else
11204 return 1;
11205 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011206 if (len1 > len2)
11207 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011208 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011209 return -1; /* str is longer */
11210 return 0;
11211 }
11212 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011213 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011214 /* Compare Unicode string and source character set string */
11215 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011216 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011217 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11218 /* This check keeps Python strings that end in '\0' from comparing equal
11219 to C strings identical up to that point. */
11220 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11221 return 1; /* uni is longer */
11222 if (str[i])
11223 return -1; /* str is longer */
11224 return 0;
11225 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011226}
11227
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011228static int
11229non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11230{
11231 size_t i, len;
11232 const wchar_t *p;
11233 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11234 if (strlen(str) != len)
11235 return 0;
11236 p = _PyUnicode_WSTR(unicode);
11237 assert(p);
11238 for (i = 0; i < len; i++) {
11239 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011240 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011241 return 0;
11242 }
11243 return 1;
11244}
11245
11246int
11247_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11248{
11249 size_t len;
11250 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011251 assert(str);
11252#ifndef NDEBUG
11253 for (const char *p = str; *p; p++) {
11254 assert((unsigned char)*p < 128);
11255 }
11256#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011257 if (PyUnicode_READY(unicode) == -1) {
11258 /* Memory error or bad data */
11259 PyErr_Clear();
11260 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11261 }
11262 if (!PyUnicode_IS_ASCII(unicode))
11263 return 0;
11264 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11265 return strlen(str) == len &&
11266 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11267}
11268
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011269int
11270_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11271{
11272 PyObject *right_uni;
11273 Py_hash_t hash;
11274
11275 assert(_PyUnicode_CHECK(left));
11276 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011277#ifndef NDEBUG
11278 for (const char *p = right->string; *p; p++) {
11279 assert((unsigned char)*p < 128);
11280 }
11281#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011282
11283 if (PyUnicode_READY(left) == -1) {
11284 /* memory error or bad data */
11285 PyErr_Clear();
11286 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11287 }
11288
11289 if (!PyUnicode_IS_ASCII(left))
11290 return 0;
11291
11292 right_uni = _PyUnicode_FromId(right); /* borrowed */
11293 if (right_uni == NULL) {
11294 /* memory error or bad data */
11295 PyErr_Clear();
11296 return _PyUnicode_EqualToASCIIString(left, right->string);
11297 }
11298
11299 if (left == right_uni)
11300 return 1;
11301
11302 if (PyUnicode_CHECK_INTERNED(left))
11303 return 0;
11304
INADA Naoki7cc95f52018-01-28 02:07:09 +090011305 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011306 hash = _PyUnicode_HASH(left);
11307 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11308 return 0;
11309
11310 return unicode_compare_eq(left, right_uni);
11311}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011312
Alexander Belopolsky40018472011-02-26 01:02:56 +000011313PyObject *
11314PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011315{
11316 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011317
Victor Stinnere5567ad2012-10-23 02:48:49 +020011318 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11319 Py_RETURN_NOTIMPLEMENTED;
11320
11321 if (PyUnicode_READY(left) == -1 ||
11322 PyUnicode_READY(right) == -1)
11323 return NULL;
11324
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011325 if (left == right) {
11326 switch (op) {
11327 case Py_EQ:
11328 case Py_LE:
11329 case Py_GE:
11330 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011331 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011332 case Py_NE:
11333 case Py_LT:
11334 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011335 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011336 default:
11337 PyErr_BadArgument();
11338 return NULL;
11339 }
11340 }
11341 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011342 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011343 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011344 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011345 }
11346 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011347 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011348 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011349 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011350}
11351
Alexander Belopolsky40018472011-02-26 01:02:56 +000011352int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011353_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11354{
11355 return unicode_eq(aa, bb);
11356}
11357
11358int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011359PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011360{
Victor Stinner77282cb2013-04-14 19:22:47 +020011361 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011362 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011364 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011365
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011366 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011368 "'in <string>' requires string as left operand, not %.100s",
11369 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011370 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011371 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011372 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011373 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011374 if (ensure_unicode(str) < 0)
11375 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011378 kind2 = PyUnicode_KIND(substr);
11379 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011380 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011382 len2 = PyUnicode_GET_LENGTH(substr);
11383 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011384 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011385 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011386 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011387 if (len2 == 1) {
11388 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11389 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011390 return result;
11391 }
11392 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011393 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011394 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011395 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011396 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397
Victor Stinner77282cb2013-04-14 19:22:47 +020011398 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 case PyUnicode_1BYTE_KIND:
11400 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11401 break;
11402 case PyUnicode_2BYTE_KIND:
11403 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11404 break;
11405 case PyUnicode_4BYTE_KIND:
11406 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11407 break;
11408 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011409 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011411
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011412 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011413 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011414 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415
Guido van Rossum403d68b2000-03-13 15:55:09 +000011416 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011417}
11418
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419/* Concat to string or Unicode object giving a new Unicode object. */
11420
Alexander Belopolsky40018472011-02-26 01:02:56 +000011421PyObject *
11422PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011424 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011425 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011426 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011428 if (ensure_unicode(left) < 0)
11429 return NULL;
11430
11431 if (!PyUnicode_Check(right)) {
11432 PyErr_Format(PyExc_TypeError,
11433 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011434 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011435 return NULL;
11436 }
11437 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
11440 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011441 if (left == unicode_empty)
11442 return PyUnicode_FromObject(right);
11443 if (right == unicode_empty)
11444 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011446 left_len = PyUnicode_GET_LENGTH(left);
11447 right_len = PyUnicode_GET_LENGTH(right);
11448 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011449 PyErr_SetString(PyExc_OverflowError,
11450 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011451 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011452 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011453 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011454
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011455 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11456 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011457 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011460 result = PyUnicode_New(new_len, maxchar);
11461 if (result == NULL)
11462 return NULL;
11463 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11464 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11465 assert(_PyUnicode_CheckConsistency(result, 1));
11466 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467}
11468
Walter Dörwald1ab83302007-05-18 17:15:44 +000011469void
Victor Stinner23e56682011-10-03 03:54:37 +020011470PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011471{
Victor Stinner23e56682011-10-03 03:54:37 +020011472 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011473 Py_UCS4 maxchar, maxchar2;
11474 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011475
11476 if (p_left == NULL) {
11477 if (!PyErr_Occurred())
11478 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011479 return;
11480 }
Victor Stinner23e56682011-10-03 03:54:37 +020011481 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011482 if (right == NULL || left == NULL
11483 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011484 if (!PyErr_Occurred())
11485 PyErr_BadInternalCall();
11486 goto error;
11487 }
11488
Benjamin Petersonbac79492012-01-14 13:34:47 -050011489 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011490 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011491 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011492 goto error;
11493
Victor Stinner488fa492011-12-12 00:01:39 +010011494 /* Shortcuts */
11495 if (left == unicode_empty) {
11496 Py_DECREF(left);
11497 Py_INCREF(right);
11498 *p_left = right;
11499 return;
11500 }
11501 if (right == unicode_empty)
11502 return;
11503
11504 left_len = PyUnicode_GET_LENGTH(left);
11505 right_len = PyUnicode_GET_LENGTH(right);
11506 if (left_len > PY_SSIZE_T_MAX - right_len) {
11507 PyErr_SetString(PyExc_OverflowError,
11508 "strings are too large to concat");
11509 goto error;
11510 }
11511 new_len = left_len + right_len;
11512
11513 if (unicode_modifiable(left)
11514 && PyUnicode_CheckExact(right)
11515 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011516 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11517 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011518 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011519 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011520 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11521 {
11522 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011523 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011524 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011525
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011526 /* copy 'right' into the newly allocated area of 'left' */
11527 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011528 }
Victor Stinner488fa492011-12-12 00:01:39 +010011529 else {
11530 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11531 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011532 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011533
Victor Stinner488fa492011-12-12 00:01:39 +010011534 /* Concat the two Unicode strings */
11535 res = PyUnicode_New(new_len, maxchar);
11536 if (res == NULL)
11537 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011538 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11539 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011540 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011541 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011542 }
11543 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011544 return;
11545
11546error:
Victor Stinner488fa492011-12-12 00:01:39 +010011547 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011548}
11549
11550void
11551PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11552{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011553 PyUnicode_Append(pleft, right);
11554 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011555}
11556
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011557/*
11558Wraps stringlib_parse_args_finds() and additionally ensures that the
11559first argument is a unicode object.
11560*/
11561
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011562static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011563parse_args_finds_unicode(const char * function_name, PyObject *args,
11564 PyObject **substring,
11565 Py_ssize_t *start, Py_ssize_t *end)
11566{
11567 if(stringlib_parse_args_finds(function_name, args, substring,
11568 start, end)) {
11569 if (ensure_unicode(*substring) < 0)
11570 return 0;
11571 return 1;
11572 }
11573 return 0;
11574}
11575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011576PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011579Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011580string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011581interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
11583static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011584unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011586 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011587 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011588 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011590 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011591 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011594 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597 kind1 = PyUnicode_KIND(self);
11598 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011599 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011600 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 len1 = PyUnicode_GET_LENGTH(self);
11603 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011605 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011606 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011607
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011608 buf1 = PyUnicode_DATA(self);
11609 buf2 = PyUnicode_DATA(substring);
11610 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011611 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011612 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011613 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011614 }
11615 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 case PyUnicode_1BYTE_KIND:
11617 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011618 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 buf2, len2, PY_SSIZE_T_MAX
11620 );
11621 break;
11622 case PyUnicode_2BYTE_KIND:
11623 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011624 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 buf2, len2, PY_SSIZE_T_MAX
11626 );
11627 break;
11628 case PyUnicode_4BYTE_KIND:
11629 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011630 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 buf2, len2, PY_SSIZE_T_MAX
11632 );
11633 break;
11634 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011635 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 }
11637
11638 result = PyLong_FromSsize_t(iresult);
11639
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011640 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011641 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011642 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644 return result;
11645}
11646
INADA Naoki3ae20562017-01-16 20:41:20 +090011647/*[clinic input]
11648str.encode as unicode_encode
11649
11650 encoding: str(c_default="NULL") = 'utf-8'
11651 The encoding in which to encode the string.
11652 errors: str(c_default="NULL") = 'strict'
11653 The error handling scheme to use for encoding errors.
11654 The default is 'strict' meaning that encoding errors raise a
11655 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11656 'xmlcharrefreplace' as well as any other name registered with
11657 codecs.register_error that can handle UnicodeEncodeErrors.
11658
11659Encode the string using the codec registered for encoding.
11660[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661
11662static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011663unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011664/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011666 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011667}
11668
INADA Naoki3ae20562017-01-16 20:41:20 +090011669/*[clinic input]
11670str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671
INADA Naoki3ae20562017-01-16 20:41:20 +090011672 tabsize: int = 8
11673
11674Return a copy where all tab characters are expanded using spaces.
11675
11676If tabsize is not given, a tab size of 8 characters is assumed.
11677[clinic start generated code]*/
11678
11679static PyObject *
11680unicode_expandtabs_impl(PyObject *self, int tabsize)
11681/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011683 Py_ssize_t i, j, line_pos, src_len, incr;
11684 Py_UCS4 ch;
11685 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011686 const void *src_data;
11687 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011688 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011689 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690
Antoine Pitrou22425222011-10-04 19:10:51 +020011691 if (PyUnicode_READY(self) == -1)
11692 return NULL;
11693
Thomas Wouters7e474022000-07-16 12:04:32 +000011694 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011695 src_len = PyUnicode_GET_LENGTH(self);
11696 i = j = line_pos = 0;
11697 kind = PyUnicode_KIND(self);
11698 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011699 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011700 for (; i < src_len; i++) {
11701 ch = PyUnicode_READ(kind, src_data, i);
11702 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011703 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011704 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011705 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011707 goto overflow;
11708 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011709 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011710 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011714 goto overflow;
11715 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011717 if (ch == '\n' || ch == '\r')
11718 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011720 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011721 if (!found)
11722 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011723
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011725 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 if (!u)
11727 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011728 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
Antoine Pitroue71d5742011-10-04 15:55:09 +020011730 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
Antoine Pitroue71d5742011-10-04 15:55:09 +020011732 for (; i < src_len; i++) {
11733 ch = PyUnicode_READ(kind, src_data, i);
11734 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011735 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011736 incr = tabsize - (line_pos % tabsize);
11737 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011738 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011739 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011741 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011743 line_pos++;
11744 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011745 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011746 if (ch == '\n' || ch == '\r')
11747 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011749 }
11750 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011751 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011752
Antoine Pitroue71d5742011-10-04 15:55:09 +020011753 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011754 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11755 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756}
11757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011758PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760\n\
11761Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011762such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763arguments start and end are interpreted as in slice notation.\n\
11764\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011765Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
11767static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011770 /* initialize variables to prevent gcc warning */
11771 PyObject *substring = NULL;
11772 Py_ssize_t start = 0;
11773 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011774 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011776 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011779 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011782 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 if (result == -2)
11785 return NULL;
11786
Christian Heimes217cfd12007-12-02 14:31:20 +000011787 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788}
11789
11790static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011791unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011793 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011794 enum PyUnicode_Kind kind;
11795 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011796
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011797 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011798 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011800 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011801 if (PyUnicode_READY(self) == -1) {
11802 return NULL;
11803 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011804 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11805 PyErr_SetString(PyExc_IndexError, "string index out of range");
11806 return NULL;
11807 }
11808 kind = PyUnicode_KIND(self);
11809 data = PyUnicode_DATA(self);
11810 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011811 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812}
11813
Guido van Rossumc2504932007-09-18 19:42:40 +000011814/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011815 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011816static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011817unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011819 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011820
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011821#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011822 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011823#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 if (_PyUnicode_HASH(self) != -1)
11825 return _PyUnicode_HASH(self);
11826 if (PyUnicode_READY(self) == -1)
11827 return -1;
animalizea1d14252019-01-02 20:16:06 +080011828
Christian Heimes985ecdc2013-11-20 11:46:18 +010011829 x = _Py_HashBytes(PyUnicode_DATA(self),
11830 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011832 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833}
11834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011835PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837\n\
oldkaa0735f2018-02-02 16:52:55 +080011838Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011839such that sub is contained within S[start:end]. Optional\n\
11840arguments start and end are interpreted as in slice notation.\n\
11841\n\
11842Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843
11844static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011847 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011848 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011849 PyObject *substring = NULL;
11850 Py_ssize_t start = 0;
11851 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011853 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011856 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011859 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 if (result == -2)
11862 return NULL;
11863
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864 if (result < 0) {
11865 PyErr_SetString(PyExc_ValueError, "substring not found");
11866 return NULL;
11867 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011868
Christian Heimes217cfd12007-12-02 14:31:20 +000011869 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870}
11871
INADA Naoki3ae20562017-01-16 20:41:20 +090011872/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011873str.isascii as unicode_isascii
11874
11875Return True if all characters in the string are ASCII, False otherwise.
11876
11877ASCII characters have code points in the range U+0000-U+007F.
11878Empty string is ASCII too.
11879[clinic start generated code]*/
11880
11881static PyObject *
11882unicode_isascii_impl(PyObject *self)
11883/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11884{
11885 if (PyUnicode_READY(self) == -1) {
11886 return NULL;
11887 }
11888 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11889}
11890
11891/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011892str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
INADA Naoki3ae20562017-01-16 20:41:20 +090011894Return True if the string is a lowercase string, False otherwise.
11895
11896A string is lowercase if all cased characters in the string are lowercase and
11897there is at least one cased character in the string.
11898[clinic start generated code]*/
11899
11900static PyObject *
11901unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011902/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 Py_ssize_t i, length;
11905 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011906 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 int cased;
11908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 if (PyUnicode_READY(self) == -1)
11910 return NULL;
11911 length = PyUnicode_GET_LENGTH(self);
11912 kind = PyUnicode_KIND(self);
11913 data = PyUnicode_DATA(self);
11914
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 if (length == 1)
11917 return PyBool_FromLong(
11918 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011920 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011922 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011923
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 for (i = 0; i < length; i++) {
11926 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011927
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011929 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 else if (!cased && Py_UNICODE_ISLOWER(ch))
11931 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011933 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934}
11935
INADA Naoki3ae20562017-01-16 20:41:20 +090011936/*[clinic input]
11937str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938
INADA Naoki3ae20562017-01-16 20:41:20 +090011939Return True if the string is an uppercase string, False otherwise.
11940
11941A string is uppercase if all cased characters in the string are uppercase and
11942there is at least one cased character in the string.
11943[clinic start generated code]*/
11944
11945static PyObject *
11946unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011947/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 Py_ssize_t i, length;
11950 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011951 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952 int cased;
11953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (PyUnicode_READY(self) == -1)
11955 return NULL;
11956 length = PyUnicode_GET_LENGTH(self);
11957 kind = PyUnicode_KIND(self);
11958 data = PyUnicode_DATA(self);
11959
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 if (length == 1)
11962 return PyBool_FromLong(
11963 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011965 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011967 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011968
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 for (i = 0; i < length; i++) {
11971 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011972
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011974 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 else if (!cased && Py_UNICODE_ISUPPER(ch))
11976 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011978 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979}
11980
INADA Naoki3ae20562017-01-16 20:41:20 +090011981/*[clinic input]
11982str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983
INADA Naoki3ae20562017-01-16 20:41:20 +090011984Return True if the string is a title-cased string, False otherwise.
11985
11986In a title-cased string, upper- and title-case characters may only
11987follow uncased characters and lowercase characters only cased ones.
11988[clinic start generated code]*/
11989
11990static PyObject *
11991unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011992/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 Py_ssize_t i, length;
11995 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011996 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997 int cased, previous_is_cased;
11998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 if (PyUnicode_READY(self) == -1)
12000 return NULL;
12001 length = PyUnicode_GET_LENGTH(self);
12002 kind = PyUnicode_KIND(self);
12003 data = PyUnicode_DATA(self);
12004
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 if (length == 1) {
12007 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12008 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12009 (Py_UNICODE_ISUPPER(ch) != 0));
12010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012012 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012014 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012015
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016 cased = 0;
12017 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 for (i = 0; i < length; i++) {
12019 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012020
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12022 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012023 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 previous_is_cased = 1;
12025 cased = 1;
12026 }
12027 else if (Py_UNICODE_ISLOWER(ch)) {
12028 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012029 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 previous_is_cased = 1;
12031 cased = 1;
12032 }
12033 else
12034 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012036 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037}
12038
INADA Naoki3ae20562017-01-16 20:41:20 +090012039/*[clinic input]
12040str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041
INADA Naoki3ae20562017-01-16 20:41:20 +090012042Return True if the string is a whitespace string, False otherwise.
12043
12044A string is whitespace if all characters in the string are whitespace and there
12045is at least one character in the string.
12046[clinic start generated code]*/
12047
12048static PyObject *
12049unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012050/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 Py_ssize_t i, length;
12053 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012054 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055
12056 if (PyUnicode_READY(self) == -1)
12057 return NULL;
12058 length = PyUnicode_GET_LENGTH(self);
12059 kind = PyUnicode_KIND(self);
12060 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 if (length == 1)
12064 return PyBool_FromLong(
12065 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012067 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012069 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 for (i = 0; i < length; i++) {
12072 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012073 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012074 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012076 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077}
12078
INADA Naoki3ae20562017-01-16 20:41:20 +090012079/*[clinic input]
12080str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012081
INADA Naoki3ae20562017-01-16 20:41:20 +090012082Return True if the string is an alphabetic string, False otherwise.
12083
12084A string is alphabetic if all characters in the string are alphabetic and there
12085is at least one character in the string.
12086[clinic start generated code]*/
12087
12088static PyObject *
12089unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012090/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 Py_ssize_t i, length;
12093 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012094 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095
12096 if (PyUnicode_READY(self) == -1)
12097 return NULL;
12098 length = PyUnicode_GET_LENGTH(self);
12099 kind = PyUnicode_KIND(self);
12100 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012101
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012102 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 if (length == 1)
12104 return PyBool_FromLong(
12105 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012106
12107 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012109 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 for (i = 0; i < length; i++) {
12112 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012113 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012114 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012115 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012116}
12117
INADA Naoki3ae20562017-01-16 20:41:20 +090012118/*[clinic input]
12119str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012120
INADA Naoki3ae20562017-01-16 20:41:20 +090012121Return True if the string is an alpha-numeric string, False otherwise.
12122
12123A string is alpha-numeric if all characters in the string are alpha-numeric and
12124there is at least one character in the string.
12125[clinic start generated code]*/
12126
12127static PyObject *
12128unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012129/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012132 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 Py_ssize_t len, i;
12134
12135 if (PyUnicode_READY(self) == -1)
12136 return NULL;
12137
12138 kind = PyUnicode_KIND(self);
12139 data = PyUnicode_DATA(self);
12140 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012141
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012142 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 if (len == 1) {
12144 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12145 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12146 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012147
12148 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012150 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 for (i = 0; i < len; i++) {
12153 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012154 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012155 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012156 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012157 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012158}
12159
INADA Naoki3ae20562017-01-16 20:41:20 +090012160/*[clinic input]
12161str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162
INADA Naoki3ae20562017-01-16 20:41:20 +090012163Return True if the string is a decimal string, False otherwise.
12164
12165A string is a decimal string if all characters in the string are decimal and
12166there is at least one character in the string.
12167[clinic start generated code]*/
12168
12169static PyObject *
12170unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012171/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 Py_ssize_t i, length;
12174 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012175 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176
12177 if (PyUnicode_READY(self) == -1)
12178 return NULL;
12179 length = PyUnicode_GET_LENGTH(self);
12180 kind = PyUnicode_KIND(self);
12181 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 if (length == 1)
12185 return PyBool_FromLong(
12186 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012188 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012190 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 for (i = 0; i < length; i++) {
12193 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012194 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012196 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197}
12198
INADA Naoki3ae20562017-01-16 20:41:20 +090012199/*[clinic input]
12200str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
INADA Naoki3ae20562017-01-16 20:41:20 +090012202Return True if the string is a digit string, False otherwise.
12203
12204A string is a digit string if all characters in the string are digits and there
12205is at least one character in the string.
12206[clinic start generated code]*/
12207
12208static PyObject *
12209unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012210/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 Py_ssize_t i, length;
12213 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012214 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215
12216 if (PyUnicode_READY(self) == -1)
12217 return NULL;
12218 length = PyUnicode_GET_LENGTH(self);
12219 kind = PyUnicode_KIND(self);
12220 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 if (length == 1) {
12224 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12225 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012228 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012230 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 for (i = 0; i < length; i++) {
12233 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012234 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012236 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237}
12238
INADA Naoki3ae20562017-01-16 20:41:20 +090012239/*[clinic input]
12240str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241
INADA Naoki3ae20562017-01-16 20:41:20 +090012242Return True if the string is a numeric string, False otherwise.
12243
12244A string is numeric if all characters in the string are numeric and there is at
12245least one character in the string.
12246[clinic start generated code]*/
12247
12248static PyObject *
12249unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012250/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 Py_ssize_t i, length;
12253 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012254 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255
12256 if (PyUnicode_READY(self) == -1)
12257 return NULL;
12258 length = PyUnicode_GET_LENGTH(self);
12259 kind = PyUnicode_KIND(self);
12260 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 if (length == 1)
12264 return PyBool_FromLong(
12265 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012267 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012269 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 for (i = 0; i < length; i++) {
12272 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012273 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012275 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276}
12277
Martin v. Löwis47383402007-08-15 07:32:56 +000012278int
12279PyUnicode_IsIdentifier(PyObject *self)
12280{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 Py_ssize_t i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012282 int ready = PyUnicode_IS_READY(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012283
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012284 Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
12285 if (len == 0) {
12286 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 }
12289
Hai Shi3d235f52020-02-17 21:41:15 +080012290 int kind = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012291 const void *data = NULL;
Andy Lester933fc53f2020-02-20 22:51:47 -060012292 const wchar_t *wstr = NULL;
12293 Py_UCS4 ch;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012294 if (ready) {
12295 kind = PyUnicode_KIND(self);
12296 data = PyUnicode_DATA(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012297 ch = PyUnicode_READ(kind, data, 0);
12298 }
12299 else {
Andy Lester933fc53f2020-02-20 22:51:47 -060012300 wstr = _PyUnicode_WSTR(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012301 ch = wstr[0];
12302 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012303 /* PEP 3131 says that the first character must be in
12304 XID_Start and subsequent characters in XID_Continue,
12305 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012306 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012307 letters, digits, underscore). However, given the current
12308 definition of XID_Start and XID_Continue, it is sufficient
12309 to check just for these, except that _ must be allowed
12310 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012311 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012312 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012313 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012314
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012315 for (i = 1; i < len; i++) {
12316 if (ready) {
12317 ch = PyUnicode_READ(kind, data, i);
12318 }
12319 else {
12320 ch = wstr[i];
12321 }
12322 if (!_PyUnicode_IsXidContinue(ch)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012324 }
12325 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012326 return 1;
12327}
12328
INADA Naoki3ae20562017-01-16 20:41:20 +090012329/*[clinic input]
12330str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012331
INADA Naoki3ae20562017-01-16 20:41:20 +090012332Return True if the string is a valid Python identifier, False otherwise.
12333
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012334Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012335such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012336[clinic start generated code]*/
12337
12338static PyObject *
12339unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012340/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012341{
12342 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12343}
12344
INADA Naoki3ae20562017-01-16 20:41:20 +090012345/*[clinic input]
12346str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012347
INADA Naoki3ae20562017-01-16 20:41:20 +090012348Return True if the string is printable, False otherwise.
12349
12350A string is printable if all of its characters are considered printable in
12351repr() or if it is empty.
12352[clinic start generated code]*/
12353
12354static PyObject *
12355unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012356/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 Py_ssize_t i, length;
12359 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012360 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361
12362 if (PyUnicode_READY(self) == -1)
12363 return NULL;
12364 length = PyUnicode_GET_LENGTH(self);
12365 kind = PyUnicode_KIND(self);
12366 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012367
12368 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 if (length == 1)
12370 return PyBool_FromLong(
12371 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 for (i = 0; i < length; i++) {
12374 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012375 Py_RETURN_FALSE;
12376 }
12377 }
12378 Py_RETURN_TRUE;
12379}
12380
INADA Naoki3ae20562017-01-16 20:41:20 +090012381/*[clinic input]
12382str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383
INADA Naoki3ae20562017-01-16 20:41:20 +090012384 iterable: object
12385 /
12386
12387Concatenate any number of strings.
12388
Martin Panter91a88662017-01-24 00:30:06 +000012389The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012390The result is returned as a new string.
12391
12392Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12393[clinic start generated code]*/
12394
12395static PyObject *
12396unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012397/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012398{
INADA Naoki3ae20562017-01-16 20:41:20 +090012399 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400}
12401
Martin v. Löwis18e16552006-02-15 17:27:45 +000012402static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012403unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 if (PyUnicode_READY(self) == -1)
12406 return -1;
12407 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408}
12409
INADA Naoki3ae20562017-01-16 20:41:20 +090012410/*[clinic input]
12411str.ljust as unicode_ljust
12412
12413 width: Py_ssize_t
12414 fillchar: Py_UCS4 = ' '
12415 /
12416
12417Return a left-justified string of length width.
12418
12419Padding is done using the specified fill character (default is a space).
12420[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421
12422static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012423unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12424/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012426 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428
Victor Stinnerc4b49542011-12-11 22:44:26 +010012429 if (PyUnicode_GET_LENGTH(self) >= width)
12430 return unicode_result_unchanged(self);
12431
12432 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012433}
12434
INADA Naoki3ae20562017-01-16 20:41:20 +090012435/*[clinic input]
12436str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437
INADA Naoki3ae20562017-01-16 20:41:20 +090012438Return a copy of the string converted to lowercase.
12439[clinic start generated code]*/
12440
12441static PyObject *
12442unicode_lower_impl(PyObject *self)
12443/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012445 if (PyUnicode_READY(self) == -1)
12446 return NULL;
12447 if (PyUnicode_IS_ASCII(self))
12448 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012449 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450}
12451
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip mode above; used in error messages.
   Both the strings and the table itself are read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip mode (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP) to its method name. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012461/* externally visible for str.strip(unicode) */
12462PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012463_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012464{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012465 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 int kind;
12467 Py_ssize_t i, j, len;
12468 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012469 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12472 return NULL;
12473
12474 kind = PyUnicode_KIND(self);
12475 data = PyUnicode_DATA(self);
12476 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012477 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12479 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012480 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012481
Benjamin Peterson14339b62009-01-31 16:36:08 +000012482 i = 0;
12483 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012484 while (i < len) {
12485 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12486 if (!BLOOM(sepmask, ch))
12487 break;
12488 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12489 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 i++;
12491 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012492 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012493
Benjamin Peterson14339b62009-01-31 16:36:08 +000012494 j = len;
12495 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012496 j--;
12497 while (j >= i) {
12498 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12499 if (!BLOOM(sepmask, ch))
12500 break;
12501 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12502 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012504 }
12505
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012507 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012508
Victor Stinner7931d9a2011-11-04 00:22:48 +010012509 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510}
12511
12512PyObject*
12513PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12514{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012515 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012517 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518
Victor Stinnerde636f32011-10-01 03:55:54 +020012519 if (PyUnicode_READY(self) == -1)
12520 return NULL;
12521
Victor Stinner684d5fd2012-05-03 02:32:34 +020012522 length = PyUnicode_GET_LENGTH(self);
12523 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012524
Victor Stinner684d5fd2012-05-03 02:32:34 +020012525 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012526 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527
Victor Stinnerde636f32011-10-01 03:55:54 +020012528 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012529 PyErr_SetString(PyExc_IndexError, "string index out of range");
12530 return NULL;
12531 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012532 if (start >= length || end < start)
12533 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012534
Victor Stinner684d5fd2012-05-03 02:32:34 +020012535 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012536 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012537 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012538 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012539 }
12540 else {
12541 kind = PyUnicode_KIND(self);
12542 data = PyUnicode_1BYTE_DATA(self);
12543 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012544 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012545 length);
12546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548
12549static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012550do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 Py_ssize_t len, i, j;
12553
12554 if (PyUnicode_READY(self) == -1)
12555 return NULL;
12556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012558
Victor Stinnercc7af722013-04-09 22:39:24 +020012559 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012560 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012561
12562 i = 0;
12563 if (striptype != RIGHTSTRIP) {
12564 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012565 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012566 if (!_Py_ascii_whitespace[ch])
12567 break;
12568 i++;
12569 }
12570 }
12571
12572 j = len;
12573 if (striptype != LEFTSTRIP) {
12574 j--;
12575 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012576 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012577 if (!_Py_ascii_whitespace[ch])
12578 break;
12579 j--;
12580 }
12581 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012582 }
12583 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012584 else {
12585 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012586 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012587
Victor Stinnercc7af722013-04-09 22:39:24 +020012588 i = 0;
12589 if (striptype != RIGHTSTRIP) {
12590 while (i < len) {
12591 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12592 if (!Py_UNICODE_ISSPACE(ch))
12593 break;
12594 i++;
12595 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012596 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012597
12598 j = len;
12599 if (striptype != LEFTSTRIP) {
12600 j--;
12601 while (j >= i) {
12602 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12603 if (!Py_UNICODE_ISSPACE(ch))
12604 break;
12605 j--;
12606 }
12607 j++;
12608 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012609 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012610
Victor Stinner7931d9a2011-11-04 00:22:48 +010012611 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612}
12613
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012614
12615static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012616do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012617{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012618 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012619 if (PyUnicode_Check(sep))
12620 return _PyUnicode_XStrip(self, striptype, sep);
12621 else {
12622 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012623 "%s arg must be None or str",
12624 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012625 return NULL;
12626 }
12627 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012628
Benjamin Peterson14339b62009-01-31 16:36:08 +000012629 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012630}
12631
12632
INADA Naoki3ae20562017-01-16 20:41:20 +090012633/*[clinic input]
12634str.strip as unicode_strip
12635
12636 chars: object = None
12637 /
12638
Zachary Ware09895c22019-10-09 16:09:00 -050012639Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012640
12641If chars is given and not None, remove characters in chars instead.
12642[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012643
12644static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012645unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012646/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012647{
INADA Naoki3ae20562017-01-16 20:41:20 +090012648 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012649}
12650
12651
INADA Naoki3ae20562017-01-16 20:41:20 +090012652/*[clinic input]
12653str.lstrip as unicode_lstrip
12654
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012655 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012656 /
12657
12658Return a copy of the string with leading whitespace removed.
12659
12660If chars is given and not None, remove characters in chars instead.
12661[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012662
12663static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012664unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012665/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012666{
INADA Naoki3ae20562017-01-16 20:41:20 +090012667 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012668}
12669
12670
INADA Naoki3ae20562017-01-16 20:41:20 +090012671/*[clinic input]
12672str.rstrip as unicode_rstrip
12673
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012674 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012675 /
12676
12677Return a copy of the string with trailing whitespace removed.
12678
12679If chars is given and not None, remove characters in chars instead.
12680[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012681
12682static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012683unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012684/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012685{
INADA Naoki3ae20562017-01-16 20:41:20 +090012686 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012687}
12688
12689
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012691unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012693 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695
Serhiy Storchaka05997252013-01-26 12:14:02 +020012696 if (len < 1)
12697 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698
Victor Stinnerc4b49542011-12-11 22:44:26 +010012699 /* no repeat, return original string */
12700 if (len == 1)
12701 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012702
Benjamin Petersonbac79492012-01-14 13:34:47 -050012703 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 return NULL;
12705
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012706 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012707 PyErr_SetString(PyExc_OverflowError,
12708 "repeated string is too long");
12709 return NULL;
12710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012712
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012713 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 if (!u)
12715 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012716 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012719 int kind = PyUnicode_KIND(str);
12720 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012721 if (kind == PyUnicode_1BYTE_KIND) {
12722 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012723 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012724 }
12725 else if (kind == PyUnicode_2BYTE_KIND) {
12726 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012727 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012728 ucs2[n] = fill_char;
12729 } else {
12730 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12731 assert(kind == PyUnicode_4BYTE_KIND);
12732 for (n = 0; n < len; ++n)
12733 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012734 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 }
12736 else {
12737 /* number of characters copied this far */
12738 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012739 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012741 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012743 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012745 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012746 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748 }
12749
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012750 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012751 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752}
12753
Alexander Belopolsky40018472011-02-26 01:02:56 +000012754PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012755PyUnicode_Replace(PyObject *str,
12756 PyObject *substr,
12757 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012758 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012760 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12761 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012762 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012763 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764}
12765
INADA Naoki3ae20562017-01-16 20:41:20 +090012766/*[clinic input]
12767str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768
INADA Naoki3ae20562017-01-16 20:41:20 +090012769 old: unicode
12770 new: unicode
12771 count: Py_ssize_t = -1
12772 Maximum number of occurrences to replace.
12773 -1 (the default value) means replace all occurrences.
12774 /
12775
12776Return a copy with all occurrences of substring old replaced by new.
12777
12778If the optional argument count is given, only the first count occurrences are
12779replaced.
12780[clinic start generated code]*/
12781
12782static PyObject *
12783unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12784 Py_ssize_t count)
12785/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012787 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012788 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012789 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790}
12791
sweeneydea81849b2020-04-22 17:05:48 -040012792/*[clinic input]
12793str.removeprefix as unicode_removeprefix
12794
12795 prefix: unicode
12796 /
12797
12798Return a str with the given prefix string removed if present.
12799
12800If the string starts with the prefix string, return string[len(prefix):].
12801Otherwise, return a copy of the original string.
12802[clinic start generated code]*/
12803
12804static PyObject *
12805unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12806/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12807{
12808 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12809 if (match == -1) {
12810 return NULL;
12811 }
12812 if (match) {
12813 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12814 PyUnicode_GET_LENGTH(self));
12815 }
12816 return unicode_result_unchanged(self);
12817}
12818
12819/*[clinic input]
12820str.removesuffix as unicode_removesuffix
12821
12822 suffix: unicode
12823 /
12824
12825Return a str with the given suffix string removed if present.
12826
12827If the string ends with the suffix string and that suffix is not empty,
12828return string[:-len(suffix)]. Otherwise, return a copy of the original
12829string.
12830[clinic start generated code]*/
12831
12832static PyObject *
12833unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12834/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12835{
12836 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12837 if (match == -1) {
12838 return NULL;
12839 }
12840 if (match) {
12841 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12842 - PyUnicode_GET_LENGTH(suffix));
12843 }
12844 return unicode_result_unchanged(self);
12845}
12846
Alexander Belopolsky40018472011-02-26 01:02:56 +000012847static PyObject *
12848unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012850 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 Py_ssize_t isize;
12852 Py_ssize_t osize, squote, dquote, i, o;
12853 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012854 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012855 const void *idata;
12856 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012858 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012859 return NULL;
12860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 isize = PyUnicode_GET_LENGTH(unicode);
12862 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012864 /* Compute length of output, quote characters, and
12865 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012866 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 max = 127;
12868 squote = dquote = 0;
12869 ikind = PyUnicode_KIND(unicode);
12870 for (i = 0; i < isize; i++) {
12871 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012872 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012874 case '\'': squote++; break;
12875 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012877 incr = 2;
12878 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 default:
12880 /* Fast-path ASCII */
12881 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012882 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012883 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012884 ;
12885 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012888 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012889 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012890 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012892 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012894 if (osize > PY_SSIZE_T_MAX - incr) {
12895 PyErr_SetString(PyExc_OverflowError,
12896 "string is too long to generate repr");
12897 return NULL;
12898 }
12899 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900 }
12901
12902 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012903 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012905 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906 if (dquote)
12907 /* Both squote and dquote present. Use squote,
12908 and escape them */
12909 osize += squote;
12910 else
12911 quote = '"';
12912 }
Victor Stinner55c08782013-04-14 18:45:39 +020012913 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914
12915 repr = PyUnicode_New(osize, max);
12916 if (repr == NULL)
12917 return NULL;
12918 okind = PyUnicode_KIND(repr);
12919 odata = PyUnicode_DATA(repr);
12920
12921 PyUnicode_WRITE(okind, odata, 0, quote);
12922 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012923 if (unchanged) {
12924 _PyUnicode_FastCopyCharacters(repr, 1,
12925 unicode, 0,
12926 isize);
12927 }
12928 else {
12929 for (i = 0, o = 1; i < isize; i++) {
12930 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931
Victor Stinner55c08782013-04-14 18:45:39 +020012932 /* Escape quotes and backslashes */
12933 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012934 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012936 continue;
12937 }
12938
12939 /* Map special whitespace to '\t', \n', '\r' */
12940 if (ch == '\t') {
12941 PyUnicode_WRITE(okind, odata, o++, '\\');
12942 PyUnicode_WRITE(okind, odata, o++, 't');
12943 }
12944 else if (ch == '\n') {
12945 PyUnicode_WRITE(okind, odata, o++, '\\');
12946 PyUnicode_WRITE(okind, odata, o++, 'n');
12947 }
12948 else if (ch == '\r') {
12949 PyUnicode_WRITE(okind, odata, o++, '\\');
12950 PyUnicode_WRITE(okind, odata, o++, 'r');
12951 }
12952
12953 /* Map non-printable US ASCII to '\xhh' */
12954 else if (ch < ' ' || ch == 0x7F) {
12955 PyUnicode_WRITE(okind, odata, o++, '\\');
12956 PyUnicode_WRITE(okind, odata, o++, 'x');
12957 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12958 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12959 }
12960
12961 /* Copy ASCII characters as-is */
12962 else if (ch < 0x7F) {
12963 PyUnicode_WRITE(okind, odata, o++, ch);
12964 }
12965
12966 /* Non-ASCII characters */
12967 else {
12968 /* Map Unicode whitespace and control characters
12969 (categories Z* and C* except ASCII space)
12970 */
12971 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12972 PyUnicode_WRITE(okind, odata, o++, '\\');
12973 /* Map 8-bit characters to '\xhh' */
12974 if (ch <= 0xff) {
12975 PyUnicode_WRITE(okind, odata, o++, 'x');
12976 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12977 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12978 }
12979 /* Map 16-bit characters to '\uxxxx' */
12980 else if (ch <= 0xffff) {
12981 PyUnicode_WRITE(okind, odata, o++, 'u');
12982 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12983 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12984 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12985 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12986 }
12987 /* Map 21-bit characters to '\U00xxxxxx' */
12988 else {
12989 PyUnicode_WRITE(okind, odata, o++, 'U');
12990 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12991 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12992 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12993 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12994 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12995 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12996 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12997 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12998 }
12999 }
13000 /* Copy characters as-is */
13001 else {
13002 PyUnicode_WRITE(okind, odata, o++, ch);
13003 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013004 }
13005 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013008 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013009 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013010}
13011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013012PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013013 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013014\n\
13015Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013016such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017arguments start and end are interpreted as in slice notation.\n\
13018\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013019Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020
13021static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013024 /* initialize variables to prevent gcc warning */
13025 PyObject *substring = NULL;
13026 Py_ssize_t start = 0;
13027 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013028 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013029
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013030 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013033 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013036 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 if (result == -2)
13039 return NULL;
13040
Christian Heimes217cfd12007-12-02 14:31:20 +000013041 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042}
13043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013044PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013045 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013047Return the highest index in S where substring sub is found,\n\
13048such that sub is contained within S[start:end]. Optional\n\
13049arguments start and end are interpreted as in slice notation.\n\
13050\n\
13051Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052
13053static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013056 /* initialize variables to prevent gcc warning */
13057 PyObject *substring = NULL;
13058 Py_ssize_t start = 0;
13059 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013060 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013062 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013063 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013065 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013068 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 if (result == -2)
13071 return NULL;
13072
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073 if (result < 0) {
13074 PyErr_SetString(PyExc_ValueError, "substring not found");
13075 return NULL;
13076 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077
Christian Heimes217cfd12007-12-02 14:31:20 +000013078 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079}
13080
INADA Naoki3ae20562017-01-16 20:41:20 +090013081/*[clinic input]
13082str.rjust as unicode_rjust
13083
13084 width: Py_ssize_t
13085 fillchar: Py_UCS4 = ' '
13086 /
13087
13088Return a right-justified string of length width.
13089
13090Padding is done using the specified fill character (default is a space).
13091[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092
13093static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013094unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13095/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013096{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013097 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098 return NULL;
13099
Victor Stinnerc4b49542011-12-11 22:44:26 +010013100 if (PyUnicode_GET_LENGTH(self) >= width)
13101 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102
Victor Stinnerc4b49542011-12-11 22:44:26 +010013103 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104}
13105
Alexander Belopolsky40018472011-02-26 01:02:56 +000013106PyObject *
13107PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013109 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013110 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013112 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113}
13114
INADA Naoki3ae20562017-01-16 20:41:20 +090013115/*[clinic input]
13116str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117
INADA Naoki3ae20562017-01-16 20:41:20 +090013118 sep: object = None
13119 The delimiter according which to split the string.
13120 None (the default value) means split according to any whitespace,
13121 and discard empty strings from the result.
13122 maxsplit: Py_ssize_t = -1
13123 Maximum number of splits to do.
13124 -1 (the default value) means no limit.
13125
13126Return a list of the words in the string, using sep as the delimiter string.
13127[clinic start generated code]*/
13128
13129static PyObject *
13130unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13131/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132{
INADA Naoki3ae20562017-01-16 20:41:20 +090013133 if (sep == Py_None)
13134 return split(self, NULL, maxsplit);
13135 if (PyUnicode_Check(sep))
13136 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013137
Victor Stinner998b8062018-09-12 00:23:25 +020013138 PyErr_Format(PyExc_TypeError,
13139 "must be str or None, not %.100s",
13140 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142}
13143
Thomas Wouters477c8d52006-05-27 19:21:47 +000013144PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013145PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013146{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013147 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013148 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013149 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013151
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013152 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013153 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013154
Victor Stinner14f8f022011-10-05 20:58:25 +020013155 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013157 len1 = PyUnicode_GET_LENGTH(str_obj);
13158 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013159 if (kind1 < kind2 || len1 < len2) {
13160 _Py_INCREF_UNICODE_EMPTY();
13161 if (!unicode_empty)
13162 out = NULL;
13163 else {
13164 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13165 Py_DECREF(unicode_empty);
13166 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013167 return out;
13168 }
13169 buf1 = PyUnicode_DATA(str_obj);
13170 buf2 = PyUnicode_DATA(sep_obj);
13171 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013172 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013173 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013174 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013175 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013176
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013177 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013179 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13180 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13181 else
13182 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 break;
13184 case PyUnicode_2BYTE_KIND:
13185 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13186 break;
13187 case PyUnicode_4BYTE_KIND:
13188 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13189 break;
13190 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013191 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013193
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013194 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013195 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013196 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013197
13198 return out;
13199}
13200
13201
13202PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013203PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013204{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013205 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013206 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013207 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013209
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013210 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013212
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013213 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013214 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 len1 = PyUnicode_GET_LENGTH(str_obj);
13216 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013217 if (kind1 < kind2 || len1 < len2) {
13218 _Py_INCREF_UNICODE_EMPTY();
13219 if (!unicode_empty)
13220 out = NULL;
13221 else {
13222 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13223 Py_DECREF(unicode_empty);
13224 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013225 return out;
13226 }
13227 buf1 = PyUnicode_DATA(str_obj);
13228 buf2 = PyUnicode_DATA(sep_obj);
13229 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013230 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013231 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013232 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013233 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013235 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013237 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13238 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13239 else
13240 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013241 break;
13242 case PyUnicode_2BYTE_KIND:
13243 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13244 break;
13245 case PyUnicode_4BYTE_KIND:
13246 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13247 break;
13248 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013249 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013251
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013252 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013253 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013254 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013255
13256 return out;
13257}
13258
INADA Naoki3ae20562017-01-16 20:41:20 +090013259/*[clinic input]
13260str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013261
INADA Naoki3ae20562017-01-16 20:41:20 +090013262 sep: object
13263 /
13264
13265Partition the string into three parts using the given separator.
13266
13267This will search for the separator in the string. If the separator is found,
13268returns a 3-tuple containing the part before the separator, the separator
13269itself, and the part after it.
13270
13271If the separator is not found, returns a 3-tuple containing the original string
13272and two empty strings.
13273[clinic start generated code]*/
13274
13275static PyObject *
13276unicode_partition(PyObject *self, PyObject *sep)
13277/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013278{
INADA Naoki3ae20562017-01-16 20:41:20 +090013279 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013280}
13281
INADA Naoki3ae20562017-01-16 20:41:20 +090013282/*[clinic input]
13283str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013284
INADA Naoki3ae20562017-01-16 20:41:20 +090013285Partition the string into three parts using the given separator.
13286
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013287This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013288the separator is found, returns a 3-tuple containing the part before the
13289separator, the separator itself, and the part after it.
13290
13291If the separator is not found, returns a 3-tuple containing two empty strings
13292and the original string.
13293[clinic start generated code]*/
13294
13295static PyObject *
13296unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013297/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013298{
INADA Naoki3ae20562017-01-16 20:41:20 +090013299 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013300}
13301
Alexander Belopolsky40018472011-02-26 01:02:56 +000013302PyObject *
13303PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013304{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013305 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013306 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013307
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013308 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013309}
13310
INADA Naoki3ae20562017-01-16 20:41:20 +090013311/*[clinic input]
13312str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013313
INADA Naoki3ae20562017-01-16 20:41:20 +090013314Return a list of the words in the string, using sep as the delimiter string.
13315
13316Splits are done starting at the end of the string and working to the front.
13317[clinic start generated code]*/
13318
13319static PyObject *
13320unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13321/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013322{
INADA Naoki3ae20562017-01-16 20:41:20 +090013323 if (sep == Py_None)
13324 return rsplit(self, NULL, maxsplit);
13325 if (PyUnicode_Check(sep))
13326 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013327
Victor Stinner998b8062018-09-12 00:23:25 +020013328 PyErr_Format(PyExc_TypeError,
13329 "must be str or None, not %.100s",
13330 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013331 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013332}
13333
INADA Naoki3ae20562017-01-16 20:41:20 +090013334/*[clinic input]
13335str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013337 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013338
13339Return a list of the lines in the string, breaking at line boundaries.
13340
13341Line breaks are not included in the resulting list unless keepends is given and
13342true.
13343[clinic start generated code]*/
13344
13345static PyObject *
13346unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013347/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013348{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013349 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350}
13351
13352static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013353PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013354{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013355 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356}
13357
INADA Naoki3ae20562017-01-16 20:41:20 +090013358/*[clinic input]
13359str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360
INADA Naoki3ae20562017-01-16 20:41:20 +090013361Convert uppercase characters to lowercase and lowercase characters to uppercase.
13362[clinic start generated code]*/
13363
13364static PyObject *
13365unicode_swapcase_impl(PyObject *self)
13366/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013368 if (PyUnicode_READY(self) == -1)
13369 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013370 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371}
13372
Larry Hastings61272b72014-01-07 12:41:53 -080013373/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013374
Larry Hastings31826802013-10-19 00:09:25 -070013375@staticmethod
13376str.maketrans as unicode_maketrans
13377
13378 x: object
13379
13380 y: unicode=NULL
13381
13382 z: unicode=NULL
13383
13384 /
13385
13386Return a translation table usable for str.translate().
13387
13388If there is only one argument, it must be a dictionary mapping Unicode
13389ordinals (integers) or characters to Unicode ordinals, strings or None.
13390Character keys will be then converted to ordinals.
13391If there are two arguments, they must be strings of equal length, and
13392in the resulting dictionary, each character in x will be mapped to the
13393character at the same position in y. If there is a third argument, it
13394must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013395[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013396
Larry Hastings31826802013-10-19 00:09:25 -070013397static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013398unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013399/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013400{
Georg Brandlceee0772007-11-27 23:48:05 +000013401 PyObject *new = NULL, *key, *value;
13402 Py_ssize_t i = 0;
13403 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013404
Georg Brandlceee0772007-11-27 23:48:05 +000013405 new = PyDict_New();
13406 if (!new)
13407 return NULL;
13408 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013409 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013410 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013411
Georg Brandlceee0772007-11-27 23:48:05 +000013412 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013413 if (!PyUnicode_Check(x)) {
13414 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13415 "be a string if there is a second argument");
13416 goto err;
13417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013418 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013419 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13420 "arguments must have equal length");
13421 goto err;
13422 }
13423 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013424 x_kind = PyUnicode_KIND(x);
13425 y_kind = PyUnicode_KIND(y);
13426 x_data = PyUnicode_DATA(x);
13427 y_data = PyUnicode_DATA(y);
13428 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13429 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013430 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013431 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013432 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013433 if (!value) {
13434 Py_DECREF(key);
13435 goto err;
13436 }
Georg Brandlceee0772007-11-27 23:48:05 +000013437 res = PyDict_SetItem(new, key, value);
13438 Py_DECREF(key);
13439 Py_DECREF(value);
13440 if (res < 0)
13441 goto err;
13442 }
13443 /* create entries for deleting chars in z */
13444 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013445 z_kind = PyUnicode_KIND(z);
13446 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013447 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013448 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013449 if (!key)
13450 goto err;
13451 res = PyDict_SetItem(new, key, Py_None);
13452 Py_DECREF(key);
13453 if (res < 0)
13454 goto err;
13455 }
13456 }
13457 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013458 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013459 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013460
Georg Brandlceee0772007-11-27 23:48:05 +000013461 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013462 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013463 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13464 "to maketrans it must be a dict");
13465 goto err;
13466 }
13467 /* copy entries into the new dict, converting string keys to int keys */
13468 while (PyDict_Next(x, &i, &key, &value)) {
13469 if (PyUnicode_Check(key)) {
13470 /* convert string keys to integer keys */
13471 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013472 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013473 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13474 "table must be of length 1");
13475 goto err;
13476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013477 kind = PyUnicode_KIND(key);
13478 data = PyUnicode_DATA(key);
13479 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013480 if (!newkey)
13481 goto err;
13482 res = PyDict_SetItem(new, newkey, value);
13483 Py_DECREF(newkey);
13484 if (res < 0)
13485 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013486 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013487 /* just keep integer keys */
13488 if (PyDict_SetItem(new, key, value) < 0)
13489 goto err;
13490 } else {
13491 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13492 "be strings or integers");
13493 goto err;
13494 }
13495 }
13496 }
13497 return new;
13498 err:
13499 Py_DECREF(new);
13500 return NULL;
13501}
13502
INADA Naoki3ae20562017-01-16 20:41:20 +090013503/*[clinic input]
13504str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013505
INADA Naoki3ae20562017-01-16 20:41:20 +090013506 table: object
13507 Translation table, which must be a mapping of Unicode ordinals to
13508 Unicode ordinals, strings, or None.
13509 /
13510
13511Replace each character in the string using the given translation table.
13512
13513The table must implement lookup/indexing via __getitem__, for instance a
13514dictionary or list. If this operation raises LookupError, the character is
13515left untouched. Characters mapped to None are deleted.
13516[clinic start generated code]*/
13517
13518static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013519unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013520/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013522 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013523}
13524
INADA Naoki3ae20562017-01-16 20:41:20 +090013525/*[clinic input]
13526str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013527
INADA Naoki3ae20562017-01-16 20:41:20 +090013528Return a copy of the string converted to uppercase.
13529[clinic start generated code]*/
13530
13531static PyObject *
13532unicode_upper_impl(PyObject *self)
13533/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013534{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013535 if (PyUnicode_READY(self) == -1)
13536 return NULL;
13537 if (PyUnicode_IS_ASCII(self))
13538 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013539 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013540}
13541
INADA Naoki3ae20562017-01-16 20:41:20 +090013542/*[clinic input]
13543str.zfill as unicode_zfill
13544
13545 width: Py_ssize_t
13546 /
13547
13548Pad a numeric string with zeros on the left, to fill a field of the given width.
13549
13550The string is never truncated.
13551[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013552
13553static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013554unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013555/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013556{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013557 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013558 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013559 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013560 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013561 Py_UCS4 chr;
13562
Benjamin Petersonbac79492012-01-14 13:34:47 -050013563 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013564 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013565
Victor Stinnerc4b49542011-12-11 22:44:26 +010013566 if (PyUnicode_GET_LENGTH(self) >= width)
13567 return unicode_result_unchanged(self);
13568
13569 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013570
13571 u = pad(self, fill, 0, '0');
13572
Walter Dörwald068325e2002-04-15 13:36:47 +000013573 if (u == NULL)
13574 return NULL;
13575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013576 kind = PyUnicode_KIND(u);
13577 data = PyUnicode_DATA(u);
13578 chr = PyUnicode_READ(kind, data, fill);
13579
13580 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013581 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013582 PyUnicode_WRITE(kind, data, 0, chr);
13583 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013584 }
13585
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013586 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013587 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013588}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013589
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013598PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013599 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013600\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013601Return True if S starts with the specified prefix, False otherwise.\n\
13602With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013603With optional end, stop comparing S at that position.\n\
13604prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013605
13606static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013607unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013609{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013610 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013611 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013612 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013613 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013614 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013615
Jesus Ceaac451502011-04-20 17:09:23 +020013616 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013617 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013618 if (PyTuple_Check(subobj)) {
13619 Py_ssize_t i;
13620 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013621 substring = PyTuple_GET_ITEM(subobj, i);
13622 if (!PyUnicode_Check(substring)) {
13623 PyErr_Format(PyExc_TypeError,
13624 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013625 "not %.100s",
13626 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013627 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013628 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013629 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013630 if (result == -1)
13631 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013632 if (result) {
13633 Py_RETURN_TRUE;
13634 }
13635 }
13636 /* nothing matched */
13637 Py_RETURN_FALSE;
13638 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013639 if (!PyUnicode_Check(subobj)) {
13640 PyErr_Format(PyExc_TypeError,
13641 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013642 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013643 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013644 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013645 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013646 if (result == -1)
13647 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013648 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013649}
13650
13651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013652PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013653 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013654\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013655Return True if S ends with the specified suffix, False otherwise.\n\
13656With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013657With optional end, stop comparing S at that position.\n\
13658suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013659
13660static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013661unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013662 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013663{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013664 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013665 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013666 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013667 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013668 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013669
Jesus Ceaac451502011-04-20 17:09:23 +020013670 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013672 if (PyTuple_Check(subobj)) {
13673 Py_ssize_t i;
13674 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013675 substring = PyTuple_GET_ITEM(subobj, i);
13676 if (!PyUnicode_Check(substring)) {
13677 PyErr_Format(PyExc_TypeError,
13678 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013679 "not %.100s",
13680 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013682 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013683 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013684 if (result == -1)
13685 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013686 if (result) {
13687 Py_RETURN_TRUE;
13688 }
13689 }
13690 Py_RETURN_FALSE;
13691 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013692 if (!PyUnicode_Check(subobj)) {
13693 PyErr_Format(PyExc_TypeError,
13694 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013695 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013696 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013697 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013698 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013699 if (result == -1)
13700 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013701 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013702}
13703
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013704static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013705_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013706{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013707 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13708 writer->data = PyUnicode_DATA(writer->buffer);
13709
13710 if (!writer->readonly) {
13711 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013712 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013713 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013714 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013715 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13716 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13717 writer->kind = PyUnicode_WCHAR_KIND;
13718 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13719
Victor Stinner8f674cc2013-04-17 23:02:17 +020013720 /* Copy-on-write mode: set buffer size to 0 so
13721 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13722 * next write. */
13723 writer->size = 0;
13724 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013725}
13726
Victor Stinnerd3f08822012-05-29 12:57:52 +020013727void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013728_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013729{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013730 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013731
13732 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013733 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013734
13735 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13736 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13737 writer->kind = PyUnicode_WCHAR_KIND;
13738 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013739}
13740
Inada Naoki770847a2019-06-24 12:30:24 +090013741// Initialize _PyUnicodeWriter with initial buffer
13742static inline void
13743_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13744{
13745 memset(writer, 0, sizeof(*writer));
13746 writer->buffer = buffer;
13747 _PyUnicodeWriter_Update(writer);
13748 writer->min_length = writer->size;
13749}
13750
Victor Stinnerd3f08822012-05-29 12:57:52 +020013751int
13752_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13753 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013754{
13755 Py_ssize_t newlen;
13756 PyObject *newbuffer;
13757
Victor Stinner2740e462016-09-06 16:58:36 -070013758 assert(maxchar <= MAX_UNICODE);
13759
Victor Stinnerca9381e2015-09-22 00:58:32 +020013760 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013761 assert((maxchar > writer->maxchar && length >= 0)
13762 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013763
Victor Stinner202fdca2012-05-07 12:47:02 +020013764 if (length > PY_SSIZE_T_MAX - writer->pos) {
13765 PyErr_NoMemory();
13766 return -1;
13767 }
13768 newlen = writer->pos + length;
13769
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013770 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013771
Victor Stinnerd3f08822012-05-29 12:57:52 +020013772 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013773 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013774 if (writer->overallocate
13775 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13776 /* overallocate to limit the number of realloc() */
13777 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013778 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013779 if (newlen < writer->min_length)
13780 newlen = writer->min_length;
13781
Victor Stinnerd3f08822012-05-29 12:57:52 +020013782 writer->buffer = PyUnicode_New(newlen, maxchar);
13783 if (writer->buffer == NULL)
13784 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013785 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013786 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013787 if (writer->overallocate
13788 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13789 /* overallocate to limit the number of realloc() */
13790 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013791 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013792 if (newlen < writer->min_length)
13793 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013794
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013795 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013796 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013797 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013798 newbuffer = PyUnicode_New(newlen, maxchar);
13799 if (newbuffer == NULL)
13800 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013801 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13802 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013803 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013804 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013805 }
13806 else {
13807 newbuffer = resize_compact(writer->buffer, newlen);
13808 if (newbuffer == NULL)
13809 return -1;
13810 }
13811 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013812 }
13813 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013814 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013815 newbuffer = PyUnicode_New(writer->size, maxchar);
13816 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013817 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013818 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13819 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013820 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013821 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013822 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013823 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013824
13825#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013826}
13827
Victor Stinnerca9381e2015-09-22 00:58:32 +020013828int
13829_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13830 enum PyUnicode_Kind kind)
13831{
13832 Py_UCS4 maxchar;
13833
13834 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13835 assert(writer->kind < kind);
13836
13837 switch (kind)
13838 {
13839 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13840 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13841 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13842 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013843 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013844 }
13845
13846 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13847}
13848
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013849static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013850_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013851{
Victor Stinner2740e462016-09-06 16:58:36 -070013852 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013853 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13854 return -1;
13855 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13856 writer->pos++;
13857 return 0;
13858}
13859
13860int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013861_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13862{
13863 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13864}
13865
13866int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013867_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13868{
13869 Py_UCS4 maxchar;
13870 Py_ssize_t len;
13871
13872 if (PyUnicode_READY(str) == -1)
13873 return -1;
13874 len = PyUnicode_GET_LENGTH(str);
13875 if (len == 0)
13876 return 0;
13877 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13878 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013879 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013880 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013881 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013882 Py_INCREF(str);
13883 writer->buffer = str;
13884 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013885 writer->pos += len;
13886 return 0;
13887 }
13888 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13889 return -1;
13890 }
13891 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13892 str, 0, len);
13893 writer->pos += len;
13894 return 0;
13895}
13896
Victor Stinnere215d962012-10-06 23:03:36 +020013897int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013898_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13899 Py_ssize_t start, Py_ssize_t end)
13900{
13901 Py_UCS4 maxchar;
13902 Py_ssize_t len;
13903
13904 if (PyUnicode_READY(str) == -1)
13905 return -1;
13906
13907 assert(0 <= start);
13908 assert(end <= PyUnicode_GET_LENGTH(str));
13909 assert(start <= end);
13910
13911 if (end == 0)
13912 return 0;
13913
13914 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13915 return _PyUnicodeWriter_WriteStr(writer, str);
13916
13917 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13918 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13919 else
13920 maxchar = writer->maxchar;
13921 len = end - start;
13922
13923 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13924 return -1;
13925
13926 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13927 str, start, len);
13928 writer->pos += len;
13929 return 0;
13930}
13931
13932int
Victor Stinner4a587072013-11-19 12:54:53 +010013933_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13934 const char *ascii, Py_ssize_t len)
13935{
13936 if (len == -1)
13937 len = strlen(ascii);
13938
Andy Lestere6be9b52020-02-11 20:28:35 -060013939 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010013940
13941 if (writer->buffer == NULL && !writer->overallocate) {
13942 PyObject *str;
13943
13944 str = _PyUnicode_FromASCII(ascii, len);
13945 if (str == NULL)
13946 return -1;
13947
13948 writer->readonly = 1;
13949 writer->buffer = str;
13950 _PyUnicodeWriter_Update(writer);
13951 writer->pos += len;
13952 return 0;
13953 }
13954
13955 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13956 return -1;
13957
13958 switch (writer->kind)
13959 {
13960 case PyUnicode_1BYTE_KIND:
13961 {
13962 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13963 Py_UCS1 *data = writer->data;
13964
Christian Heimesf051e432016-09-13 20:22:02 +020013965 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013966 break;
13967 }
13968 case PyUnicode_2BYTE_KIND:
13969 {
13970 _PyUnicode_CONVERT_BYTES(
13971 Py_UCS1, Py_UCS2,
13972 ascii, ascii + len,
13973 (Py_UCS2 *)writer->data + writer->pos);
13974 break;
13975 }
13976 case PyUnicode_4BYTE_KIND:
13977 {
13978 _PyUnicode_CONVERT_BYTES(
13979 Py_UCS1, Py_UCS4,
13980 ascii, ascii + len,
13981 (Py_UCS4 *)writer->data + writer->pos);
13982 break;
13983 }
13984 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013985 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013986 }
13987
13988 writer->pos += len;
13989 return 0;
13990}
13991
13992int
13993_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13994 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013995{
13996 Py_UCS4 maxchar;
13997
Andy Lestere6be9b52020-02-11 20:28:35 -060013998 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020013999 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14000 return -1;
14001 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14002 writer->pos += len;
14003 return 0;
14004}
14005
Victor Stinnerd3f08822012-05-29 12:57:52 +020014006PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014007_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014008{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014009 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014010
Victor Stinnerd3f08822012-05-29 12:57:52 +020014011 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014012 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014013 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014014 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014015
14016 str = writer->buffer;
14017 writer->buffer = NULL;
14018
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014019 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014020 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14021 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014022 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014023
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014024 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14025 PyObject *str2;
14026 str2 = resize_compact(str, writer->pos);
14027 if (str2 == NULL) {
14028 Py_DECREF(str);
14029 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014030 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014031 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014032 }
14033
Victor Stinner15a0bd32013-07-08 22:29:55 +020014034 assert(_PyUnicode_CheckConsistency(str, 1));
14035 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014036}
14037
Victor Stinnerd3f08822012-05-29 12:57:52 +020014038void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014039_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014040{
14041 Py_CLEAR(writer->buffer);
14042}
14043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014044#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014045
14046PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014047 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014048\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014049Return a formatted version of S, using substitutions from args and kwargs.\n\
14050The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014051
Eric Smith27bbca62010-11-04 17:06:58 +000014052PyDoc_STRVAR(format_map__doc__,
14053 "S.format_map(mapping) -> str\n\
14054\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014055Return a formatted version of S, using substitutions from mapping.\n\
14056The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014057
INADA Naoki3ae20562017-01-16 20:41:20 +090014058/*[clinic input]
14059str.__format__ as unicode___format__
14060
14061 format_spec: unicode
14062 /
14063
14064Return a formatted version of the string as described by format_spec.
14065[clinic start generated code]*/
14066
Eric Smith4a7d76d2008-05-30 18:10:19 +000014067static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014068unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014069/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014070{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014071 _PyUnicodeWriter writer;
14072 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014073
Victor Stinnerd3f08822012-05-29 12:57:52 +020014074 if (PyUnicode_READY(self) == -1)
14075 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014076 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014077 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14078 self, format_spec, 0,
14079 PyUnicode_GET_LENGTH(format_spec));
14080 if (ret == -1) {
14081 _PyUnicodeWriter_Dealloc(&writer);
14082 return NULL;
14083 }
14084 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014085}
14086
INADA Naoki3ae20562017-01-16 20:41:20 +090014087/*[clinic input]
14088str.__sizeof__ as unicode_sizeof
14089
14090Return the size of the string in memory, in bytes.
14091[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014092
14093static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014094unicode_sizeof_impl(PyObject *self)
14095/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014096{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014097 Py_ssize_t size;
14098
14099 /* If it's a compact object, account for base structure +
14100 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014101 if (PyUnicode_IS_COMPACT_ASCII(self))
14102 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14103 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014104 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014105 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014106 else {
14107 /* If it is a two-block object, account for base object, and
14108 for character block if present. */
14109 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014110 if (_PyUnicode_DATA_ANY(self))
14111 size += (PyUnicode_GET_LENGTH(self) + 1) *
14112 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014113 }
14114 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014115 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014116 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14117 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14118 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14119 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014120
14121 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014122}
14123
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014124static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014125unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014126{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014127 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014128 if (!copy)
14129 return NULL;
14130 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014131}
14132
Guido van Rossumd57fd912000-03-10 22:53:23 +000014133static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014134 UNICODE_ENCODE_METHODDEF
14135 UNICODE_REPLACE_METHODDEF
14136 UNICODE_SPLIT_METHODDEF
14137 UNICODE_RSPLIT_METHODDEF
14138 UNICODE_JOIN_METHODDEF
14139 UNICODE_CAPITALIZE_METHODDEF
14140 UNICODE_CASEFOLD_METHODDEF
14141 UNICODE_TITLE_METHODDEF
14142 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014143 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014144 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014145 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014146 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014147 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014148 UNICODE_LJUST_METHODDEF
14149 UNICODE_LOWER_METHODDEF
14150 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014151 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14152 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014153 UNICODE_RJUST_METHODDEF
14154 UNICODE_RSTRIP_METHODDEF
14155 UNICODE_RPARTITION_METHODDEF
14156 UNICODE_SPLITLINES_METHODDEF
14157 UNICODE_STRIP_METHODDEF
14158 UNICODE_SWAPCASE_METHODDEF
14159 UNICODE_TRANSLATE_METHODDEF
14160 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014161 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14162 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014163 UNICODE_REMOVEPREFIX_METHODDEF
14164 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014165 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014166 UNICODE_ISLOWER_METHODDEF
14167 UNICODE_ISUPPER_METHODDEF
14168 UNICODE_ISTITLE_METHODDEF
14169 UNICODE_ISSPACE_METHODDEF
14170 UNICODE_ISDECIMAL_METHODDEF
14171 UNICODE_ISDIGIT_METHODDEF
14172 UNICODE_ISNUMERIC_METHODDEF
14173 UNICODE_ISALPHA_METHODDEF
14174 UNICODE_ISALNUM_METHODDEF
14175 UNICODE_ISIDENTIFIER_METHODDEF
14176 UNICODE_ISPRINTABLE_METHODDEF
14177 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014178 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014179 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014180 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014181 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014182 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014183#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014184 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014185 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014186#endif
14187
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014188 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014189 {NULL, NULL}
14190};
14191
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014192static PyObject *
14193unicode_mod(PyObject *v, PyObject *w)
14194{
Brian Curtindfc80e32011-08-10 20:28:54 -050014195 if (!PyUnicode_Check(v))
14196 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014197 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014198}
14199
14200static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014201 0, /*nb_add*/
14202 0, /*nb_subtract*/
14203 0, /*nb_multiply*/
14204 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014205};
14206
Guido van Rossumd57fd912000-03-10 22:53:23 +000014207static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014208 (lenfunc) unicode_length, /* sq_length */
14209 PyUnicode_Concat, /* sq_concat */
14210 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14211 (ssizeargfunc) unicode_getitem, /* sq_item */
14212 0, /* sq_slice */
14213 0, /* sq_ass_item */
14214 0, /* sq_ass_slice */
14215 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014216};
14217
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014218static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014219unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014220{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014221 if (PyUnicode_READY(self) == -1)
14222 return NULL;
14223
Victor Stinnera15e2602020-04-08 02:01:56 +020014224 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014225 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014226 if (i == -1 && PyErr_Occurred())
14227 return NULL;
14228 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014229 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014230 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014231 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014232 Py_ssize_t start, stop, step, slicelength, i;
14233 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014234 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014235 const void *src_data;
14236 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014237 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014238 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014239
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014240 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014241 return NULL;
14242 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014243 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14244 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014245
14246 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014247 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014248 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014249 slicelength == PyUnicode_GET_LENGTH(self)) {
14250 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014251 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014252 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014253 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014254 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014255 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014256 src_kind = PyUnicode_KIND(self);
14257 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014258 if (!PyUnicode_IS_ASCII(self)) {
14259 kind_limit = kind_maxchar_limit(src_kind);
14260 max_char = 0;
14261 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14262 ch = PyUnicode_READ(src_kind, src_data, cur);
14263 if (ch > max_char) {
14264 max_char = ch;
14265 if (max_char >= kind_limit)
14266 break;
14267 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014268 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014269 }
Victor Stinner55c99112011-10-13 01:17:06 +020014270 else
14271 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014272 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014273 if (result == NULL)
14274 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014275 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014276 dest_data = PyUnicode_DATA(result);
14277
14278 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014279 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14280 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014281 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014282 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014283 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014284 } else {
14285 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14286 return NULL;
14287 }
14288}
14289
14290static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014291 (lenfunc)unicode_length, /* mp_length */
14292 (binaryfunc)unicode_subscript, /* mp_subscript */
14293 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014294};
14295
Guido van Rossumd57fd912000-03-10 22:53:23 +000014296
Guido van Rossumd57fd912000-03-10 22:53:23 +000014297/* Helpers for PyUnicode_Format() */
14298
Victor Stinnera47082312012-10-04 02:19:54 +020014299struct unicode_formatter_t {
14300 PyObject *args;
14301 int args_owned;
14302 Py_ssize_t arglen, argidx;
14303 PyObject *dict;
14304
14305 enum PyUnicode_Kind fmtkind;
14306 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014307 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014308 PyObject *fmtstr;
14309
14310 _PyUnicodeWriter writer;
14311};
14312
14313struct unicode_format_arg_t {
14314 Py_UCS4 ch;
14315 int flags;
14316 Py_ssize_t width;
14317 int prec;
14318 int sign;
14319};
14320
Guido van Rossumd57fd912000-03-10 22:53:23 +000014321static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014322unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014323{
Victor Stinnera47082312012-10-04 02:19:54 +020014324 Py_ssize_t argidx = ctx->argidx;
14325
14326 if (argidx < ctx->arglen) {
14327 ctx->argidx++;
14328 if (ctx->arglen < 0)
14329 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014330 else
Victor Stinnera47082312012-10-04 02:19:54 +020014331 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014332 }
14333 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014334 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014335 return NULL;
14336}
14337
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014338/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014339
Victor Stinnera47082312012-10-04 02:19:54 +020014340/* Format a float into the writer if the writer is not NULL, or into *p_output
14341 otherwise.
14342
14343 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014344static int
Victor Stinnera47082312012-10-04 02:19:54 +020014345formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14346 PyObject **p_output,
14347 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014348{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014349 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014350 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014351 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014352 int prec;
14353 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014354
Guido van Rossumd57fd912000-03-10 22:53:23 +000014355 x = PyFloat_AsDouble(v);
14356 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014357 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014358
Victor Stinnera47082312012-10-04 02:19:54 +020014359 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014360 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014361 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014362
Victor Stinnera47082312012-10-04 02:19:54 +020014363 if (arg->flags & F_ALT)
14364 dtoa_flags = Py_DTSF_ALT;
14365 else
14366 dtoa_flags = 0;
14367 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014368 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014369 return -1;
14370 len = strlen(p);
14371 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014372 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014373 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014374 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014375 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014376 }
14377 else
14378 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014379 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014380 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014381}
14382
Victor Stinnerd0880d52012-04-27 23:40:13 +020014383/* formatlong() emulates the format codes d, u, o, x and X, and
14384 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14385 * Python's regular ints.
14386 * Return value: a new PyUnicodeObject*, or NULL if error.
14387 * The output string is of the form
14388 * "-"? ("0x" | "0X")? digit+
14389 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14390 * set in flags. The case of hex digits will be correct,
14391 * There will be at least prec digits, zero-filled on the left if
14392 * necessary to get that many.
14393 * val object to be converted
14394 * flags bitmask of format flags; only F_ALT is looked at
14395 * prec minimum number of digits; 0-fill on left if needed
14396 * type a character in [duoxX]; u acts the same as d
14397 *
14398 * CAUTION: o, x and X conversions on regular ints can never
14399 * produce a '-' sign, but can for Python's unbounded ints.
14400 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014401PyObject *
14402_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014403{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014404 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014405 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014406 Py_ssize_t i;
14407 int sign; /* 1 if '-', else 0 */
14408 int len; /* number of characters */
14409 Py_ssize_t llen;
14410 int numdigits; /* len == numnondigits + numdigits */
14411 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014412
Victor Stinnerd0880d52012-04-27 23:40:13 +020014413 /* Avoid exceeding SSIZE_T_MAX */
14414 if (prec > INT_MAX-3) {
14415 PyErr_SetString(PyExc_OverflowError,
14416 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014417 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014418 }
14419
14420 assert(PyLong_Check(val));
14421
14422 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014423 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014424 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014425 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014426 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014427 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014428 /* int and int subclasses should print numerically when a numeric */
14429 /* format code is used (see issue18780) */
14430 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014431 break;
14432 case 'o':
14433 numnondigits = 2;
14434 result = PyNumber_ToBase(val, 8);
14435 break;
14436 case 'x':
14437 case 'X':
14438 numnondigits = 2;
14439 result = PyNumber_ToBase(val, 16);
14440 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014441 }
14442 if (!result)
14443 return NULL;
14444
14445 assert(unicode_modifiable(result));
14446 assert(PyUnicode_IS_READY(result));
14447 assert(PyUnicode_IS_ASCII(result));
14448
14449 /* To modify the string in-place, there can only be one reference. */
14450 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014451 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014452 PyErr_BadInternalCall();
14453 return NULL;
14454 }
14455 buf = PyUnicode_DATA(result);
14456 llen = PyUnicode_GET_LENGTH(result);
14457 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014458 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014459 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014460 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014461 return NULL;
14462 }
14463 len = (int)llen;
14464 sign = buf[0] == '-';
14465 numnondigits += sign;
14466 numdigits = len - numnondigits;
14467 assert(numdigits > 0);
14468
14469 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014470 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014471 (type == 'o' || type == 'x' || type == 'X'))) {
14472 assert(buf[sign] == '0');
14473 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14474 buf[sign+1] == 'o');
14475 numnondigits -= 2;
14476 buf += 2;
14477 len -= 2;
14478 if (sign)
14479 buf[0] = '-';
14480 assert(len == numnondigits + numdigits);
14481 assert(numdigits > 0);
14482 }
14483
14484 /* Fill with leading zeroes to meet minimum width. */
14485 if (prec > numdigits) {
14486 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14487 numnondigits + prec);
14488 char *b1;
14489 if (!r1) {
14490 Py_DECREF(result);
14491 return NULL;
14492 }
14493 b1 = PyBytes_AS_STRING(r1);
14494 for (i = 0; i < numnondigits; ++i)
14495 *b1++ = *buf++;
14496 for (i = 0; i < prec - numdigits; i++)
14497 *b1++ = '0';
14498 for (i = 0; i < numdigits; i++)
14499 *b1++ = *buf++;
14500 *b1 = '\0';
14501 Py_DECREF(result);
14502 result = r1;
14503 buf = PyBytes_AS_STRING(result);
14504 len = numnondigits + prec;
14505 }
14506
14507 /* Fix up case for hex conversions. */
14508 if (type == 'X') {
14509 /* Need to convert all lower case letters to upper case.
14510 and need to convert 0x to 0X (and -0x to -0X). */
14511 for (i = 0; i < len; i++)
14512 if (buf[i] >= 'a' && buf[i] <= 'x')
14513 buf[i] -= 'a'-'A';
14514 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014515 if (!PyUnicode_Check(result)
14516 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014517 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014518 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014519 Py_DECREF(result);
14520 result = unicode;
14521 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014522 else if (len != PyUnicode_GET_LENGTH(result)) {
14523 if (PyUnicode_Resize(&result, len) < 0)
14524 Py_CLEAR(result);
14525 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014526 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014527}
14528
Ethan Furmandf3ed242014-01-05 06:50:30 -080014529/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014530 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014531 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014532 * -1 and raise an exception on error */
14533static int
Victor Stinnera47082312012-10-04 02:19:54 +020014534mainformatlong(PyObject *v,
14535 struct unicode_format_arg_t *arg,
14536 PyObject **p_output,
14537 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014538{
14539 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014540 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014541
14542 if (!PyNumber_Check(v))
14543 goto wrongtype;
14544
Ethan Furman9ab74802014-03-21 06:38:46 -070014545 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014546 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014547 if (type == 'o' || type == 'x' || type == 'X') {
14548 iobj = PyNumber_Index(v);
14549 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014550 if (PyErr_ExceptionMatches(PyExc_TypeError))
14551 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014552 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014553 }
14554 }
14555 else {
14556 iobj = PyNumber_Long(v);
14557 if (iobj == NULL ) {
14558 if (PyErr_ExceptionMatches(PyExc_TypeError))
14559 goto wrongtype;
14560 return -1;
14561 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014562 }
14563 assert(PyLong_Check(iobj));
14564 }
14565 else {
14566 iobj = v;
14567 Py_INCREF(iobj);
14568 }
14569
14570 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014571 && arg->width == -1 && arg->prec == -1
14572 && !(arg->flags & (F_SIGN | F_BLANK))
14573 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014574 {
14575 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014576 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014577 int base;
14578
Victor Stinnera47082312012-10-04 02:19:54 +020014579 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014580 {
14581 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014582 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014583 case 'd':
14584 case 'i':
14585 case 'u':
14586 base = 10;
14587 break;
14588 case 'o':
14589 base = 8;
14590 break;
14591 case 'x':
14592 case 'X':
14593 base = 16;
14594 break;
14595 }
14596
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014597 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14598 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014599 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014600 }
14601 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014602 return 1;
14603 }
14604
Ethan Furmanb95b5612015-01-23 20:05:18 -080014605 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014606 Py_DECREF(iobj);
14607 if (res == NULL)
14608 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014609 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014610 return 0;
14611
14612wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014613 switch(type)
14614 {
14615 case 'o':
14616 case 'x':
14617 case 'X':
14618 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014619 "%%%c format: an integer is required, "
14620 "not %.200s",
14621 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014622 break;
14623 default:
14624 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014625 "%%%c format: a number is required, "
14626 "not %.200s",
14627 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014628 break;
14629 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014630 return -1;
14631}
14632
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014633static Py_UCS4
14634formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014635{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014636 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014637 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014638 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014639 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014640 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014641 goto onError;
14642 }
14643 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014644 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014645 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014646 /* make sure number is a type of integer */
14647 if (!PyLong_Check(v)) {
14648 iobj = PyNumber_Index(v);
14649 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014650 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014651 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014652 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014653 Py_DECREF(iobj);
14654 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014655 else {
14656 x = PyLong_AsLong(v);
14657 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014658 if (x == -1 && PyErr_Occurred())
14659 goto onError;
14660
Victor Stinner8faf8212011-12-08 22:14:11 +010014661 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014662 PyErr_SetString(PyExc_OverflowError,
14663 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014664 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014665 }
14666
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014667 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014668 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014669
Benjamin Peterson29060642009-01-31 22:14:21 +000014670 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014671 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014672 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014673 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014674}
14675
Victor Stinnera47082312012-10-04 02:19:54 +020014676/* Parse options of an argument: flags, width, precision.
14677 Handle also "%(name)" syntax.
14678
14679 Return 0 if the argument has been formatted into arg->str.
14680 Return 1 if the argument has been written into ctx->writer,
14681 Raise an exception and return -1 on error. */
14682static int
14683unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14684 struct unicode_format_arg_t *arg)
14685{
14686#define FORMAT_READ(ctx) \
14687 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14688
14689 PyObject *v;
14690
Victor Stinnera47082312012-10-04 02:19:54 +020014691 if (arg->ch == '(') {
14692 /* Get argument value from a dictionary. Example: "%(name)s". */
14693 Py_ssize_t keystart;
14694 Py_ssize_t keylen;
14695 PyObject *key;
14696 int pcount = 1;
14697
14698 if (ctx->dict == NULL) {
14699 PyErr_SetString(PyExc_TypeError,
14700 "format requires a mapping");
14701 return -1;
14702 }
14703 ++ctx->fmtpos;
14704 --ctx->fmtcnt;
14705 keystart = ctx->fmtpos;
14706 /* Skip over balanced parentheses */
14707 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14708 arg->ch = FORMAT_READ(ctx);
14709 if (arg->ch == ')')
14710 --pcount;
14711 else if (arg->ch == '(')
14712 ++pcount;
14713 ctx->fmtpos++;
14714 }
14715 keylen = ctx->fmtpos - keystart - 1;
14716 if (ctx->fmtcnt < 0 || pcount > 0) {
14717 PyErr_SetString(PyExc_ValueError,
14718 "incomplete format key");
14719 return -1;
14720 }
14721 key = PyUnicode_Substring(ctx->fmtstr,
14722 keystart, keystart + keylen);
14723 if (key == NULL)
14724 return -1;
14725 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014726 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014727 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014728 }
14729 ctx->args = PyObject_GetItem(ctx->dict, key);
14730 Py_DECREF(key);
14731 if (ctx->args == NULL)
14732 return -1;
14733 ctx->args_owned = 1;
14734 ctx->arglen = -1;
14735 ctx->argidx = -2;
14736 }
14737
14738 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014739 while (--ctx->fmtcnt >= 0) {
14740 arg->ch = FORMAT_READ(ctx);
14741 ctx->fmtpos++;
14742 switch (arg->ch) {
14743 case '-': arg->flags |= F_LJUST; continue;
14744 case '+': arg->flags |= F_SIGN; continue;
14745 case ' ': arg->flags |= F_BLANK; continue;
14746 case '#': arg->flags |= F_ALT; continue;
14747 case '0': arg->flags |= F_ZERO; continue;
14748 }
14749 break;
14750 }
14751
14752 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014753 if (arg->ch == '*') {
14754 v = unicode_format_getnextarg(ctx);
14755 if (v == NULL)
14756 return -1;
14757 if (!PyLong_Check(v)) {
14758 PyErr_SetString(PyExc_TypeError,
14759 "* wants int");
14760 return -1;
14761 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014762 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014763 if (arg->width == -1 && PyErr_Occurred())
14764 return -1;
14765 if (arg->width < 0) {
14766 arg->flags |= F_LJUST;
14767 arg->width = -arg->width;
14768 }
14769 if (--ctx->fmtcnt >= 0) {
14770 arg->ch = FORMAT_READ(ctx);
14771 ctx->fmtpos++;
14772 }
14773 }
14774 else if (arg->ch >= '0' && arg->ch <= '9') {
14775 arg->width = arg->ch - '0';
14776 while (--ctx->fmtcnt >= 0) {
14777 arg->ch = FORMAT_READ(ctx);
14778 ctx->fmtpos++;
14779 if (arg->ch < '0' || arg->ch > '9')
14780 break;
14781 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14782 mixing signed and unsigned comparison. Since arg->ch is between
14783 '0' and '9', casting to int is safe. */
14784 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14785 PyErr_SetString(PyExc_ValueError,
14786 "width too big");
14787 return -1;
14788 }
14789 arg->width = arg->width*10 + (arg->ch - '0');
14790 }
14791 }
14792
14793 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014794 if (arg->ch == '.') {
14795 arg->prec = 0;
14796 if (--ctx->fmtcnt >= 0) {
14797 arg->ch = FORMAT_READ(ctx);
14798 ctx->fmtpos++;
14799 }
14800 if (arg->ch == '*') {
14801 v = unicode_format_getnextarg(ctx);
14802 if (v == NULL)
14803 return -1;
14804 if (!PyLong_Check(v)) {
14805 PyErr_SetString(PyExc_TypeError,
14806 "* wants int");
14807 return -1;
14808 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014809 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014810 if (arg->prec == -1 && PyErr_Occurred())
14811 return -1;
14812 if (arg->prec < 0)
14813 arg->prec = 0;
14814 if (--ctx->fmtcnt >= 0) {
14815 arg->ch = FORMAT_READ(ctx);
14816 ctx->fmtpos++;
14817 }
14818 }
14819 else if (arg->ch >= '0' && arg->ch <= '9') {
14820 arg->prec = arg->ch - '0';
14821 while (--ctx->fmtcnt >= 0) {
14822 arg->ch = FORMAT_READ(ctx);
14823 ctx->fmtpos++;
14824 if (arg->ch < '0' || arg->ch > '9')
14825 break;
14826 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14827 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014828 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014829 return -1;
14830 }
14831 arg->prec = arg->prec*10 + (arg->ch - '0');
14832 }
14833 }
14834 }
14835
14836 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14837 if (ctx->fmtcnt >= 0) {
14838 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14839 if (--ctx->fmtcnt >= 0) {
14840 arg->ch = FORMAT_READ(ctx);
14841 ctx->fmtpos++;
14842 }
14843 }
14844 }
14845 if (ctx->fmtcnt < 0) {
14846 PyErr_SetString(PyExc_ValueError,
14847 "incomplete format");
14848 return -1;
14849 }
14850 return 0;
14851
14852#undef FORMAT_READ
14853}
14854
14855/* Format one argument. Supported conversion specifiers:
14856
14857 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014858 - "i", "d", "u": int or float
14859 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014860 - "e", "E", "f", "F", "g", "G": float
14861 - "c": int or str (1 character)
14862
Victor Stinner8dbd4212012-12-04 09:30:24 +010014863 When possible, the output is written directly into the Unicode writer
14864 (ctx->writer). A string is created when padding is required.
14865
Victor Stinnera47082312012-10-04 02:19:54 +020014866 Return 0 if the argument has been formatted into *p_str,
14867 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014868 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014869static int
14870unicode_format_arg_format(struct unicode_formatter_t *ctx,
14871 struct unicode_format_arg_t *arg,
14872 PyObject **p_str)
14873{
14874 PyObject *v;
14875 _PyUnicodeWriter *writer = &ctx->writer;
14876
14877 if (ctx->fmtcnt == 0)
14878 ctx->writer.overallocate = 0;
14879
Victor Stinnera47082312012-10-04 02:19:54 +020014880 v = unicode_format_getnextarg(ctx);
14881 if (v == NULL)
14882 return -1;
14883
Victor Stinnera47082312012-10-04 02:19:54 +020014884
14885 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014886 case 's':
14887 case 'r':
14888 case 'a':
14889 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14890 /* Fast path */
14891 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14892 return -1;
14893 return 1;
14894 }
14895
14896 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14897 *p_str = v;
14898 Py_INCREF(*p_str);
14899 }
14900 else {
14901 if (arg->ch == 's')
14902 *p_str = PyObject_Str(v);
14903 else if (arg->ch == 'r')
14904 *p_str = PyObject_Repr(v);
14905 else
14906 *p_str = PyObject_ASCII(v);
14907 }
14908 break;
14909
14910 case 'i':
14911 case 'd':
14912 case 'u':
14913 case 'o':
14914 case 'x':
14915 case 'X':
14916 {
14917 int ret = mainformatlong(v, arg, p_str, writer);
14918 if (ret != 0)
14919 return ret;
14920 arg->sign = 1;
14921 break;
14922 }
14923
14924 case 'e':
14925 case 'E':
14926 case 'f':
14927 case 'F':
14928 case 'g':
14929 case 'G':
14930 if (arg->width == -1 && arg->prec == -1
14931 && !(arg->flags & (F_SIGN | F_BLANK)))
14932 {
14933 /* Fast path */
14934 if (formatfloat(v, arg, NULL, writer) == -1)
14935 return -1;
14936 return 1;
14937 }
14938
14939 arg->sign = 1;
14940 if (formatfloat(v, arg, p_str, NULL) == -1)
14941 return -1;
14942 break;
14943
14944 case 'c':
14945 {
14946 Py_UCS4 ch = formatchar(v);
14947 if (ch == (Py_UCS4) -1)
14948 return -1;
14949 if (arg->width == -1 && arg->prec == -1) {
14950 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014951 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014952 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014953 return 1;
14954 }
14955 *p_str = PyUnicode_FromOrdinal(ch);
14956 break;
14957 }
14958
14959 default:
14960 PyErr_Format(PyExc_ValueError,
14961 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014962 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014963 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14964 (int)arg->ch,
14965 ctx->fmtpos - 1);
14966 return -1;
14967 }
14968 if (*p_str == NULL)
14969 return -1;
14970 assert (PyUnicode_Check(*p_str));
14971 return 0;
14972}
14973
/* Write the already formatted string str into ctx->writer, applying the
   width, precision, sign and alternate-form ("#") settings stored in arg.

   arg->width and arg->sign are modified in place while the output is
   being built.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    const void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Left padding uses spaces, except for signed conversions with the
       zero flag which are padded with '0' */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no sign flag, the
           string is copied unchanged */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately below: exclude it from the
               characters copied from str */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written at all */
            arg->sign = 0;
    }
    /* From here on the width is never smaller than the payload length */
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer: compute the maximum character of the output
       (fill character and characters of str) and the number of
       characters to reserve */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* One extra character for the sign when the payload alone already
       fills the whole width */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed. With a fill character other than space
       the sign goes before the padding; with spaces it is written after
       the padding (see below). */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign uses one column of the requested width */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from
           the remaining payload, whichever fill character is used */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left for the right padding */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed. The right padding is
       always made of spaces, whatever the value of fill. */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
15128
15129/* Helper of PyUnicode_Format(): format one arg.
15130 Return 0 on success, raise an exception and return -1 on error. */
15131static int
15132unicode_format_arg(struct unicode_formatter_t *ctx)
15133{
15134 struct unicode_format_arg_t arg;
15135 PyObject *str;
15136 int ret;
15137
Victor Stinner8dbd4212012-12-04 09:30:24 +010015138 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015139 if (arg.ch == '%') {
15140 ctx->fmtpos++;
15141 ctx->fmtcnt--;
15142 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15143 return -1;
15144 return 0;
15145 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015146 arg.flags = 0;
15147 arg.width = -1;
15148 arg.prec = -1;
15149 arg.sign = 0;
15150 str = NULL;
15151
Victor Stinnera47082312012-10-04 02:19:54 +020015152 ret = unicode_format_arg_parse(ctx, &arg);
15153 if (ret == -1)
15154 return -1;
15155
15156 ret = unicode_format_arg_format(ctx, &arg, &str);
15157 if (ret == -1)
15158 return -1;
15159
15160 if (ret != 1) {
15161 ret = unicode_format_arg_output(ctx, &arg, str);
15162 Py_DECREF(str);
15163 if (ret == -1)
15164 return -1;
15165 }
15166
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015167 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015168 PyErr_SetString(PyExc_TypeError,
15169 "not all arguments converted during string formatting");
15170 return -1;
15171 }
15172 return 0;
15173}
15174
Alexander Belopolsky40018472011-02-26 01:02:56 +000015175PyObject *
15176PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015177{
Victor Stinnera47082312012-10-04 02:19:54 +020015178 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015179
Guido van Rossumd57fd912000-03-10 22:53:23 +000015180 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015181 PyErr_BadInternalCall();
15182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015183 }
Victor Stinnera47082312012-10-04 02:19:54 +020015184
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015185 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015186 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015187
15188 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015189 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15190 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15191 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15192 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015193
Victor Stinner8f674cc2013-04-17 23:02:17 +020015194 _PyUnicodeWriter_Init(&ctx.writer);
15195 ctx.writer.min_length = ctx.fmtcnt + 100;
15196 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015197
Guido van Rossumd57fd912000-03-10 22:53:23 +000015198 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015199 ctx.arglen = PyTuple_Size(args);
15200 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015201 }
15202 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015203 ctx.arglen = -1;
15204 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015205 }
Victor Stinnera47082312012-10-04 02:19:54 +020015206 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015207 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015208 ctx.dict = args;
15209 else
15210 ctx.dict = NULL;
15211 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015212
Victor Stinnera47082312012-10-04 02:19:54 +020015213 while (--ctx.fmtcnt >= 0) {
15214 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015215 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015216
15217 nonfmtpos = ctx.fmtpos++;
15218 while (ctx.fmtcnt >= 0 &&
15219 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15220 ctx.fmtpos++;
15221 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015222 }
Victor Stinnera47082312012-10-04 02:19:54 +020015223 if (ctx.fmtcnt < 0) {
15224 ctx.fmtpos--;
15225 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015226 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015227
Victor Stinnercfc4c132013-04-03 01:48:39 +020015228 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15229 nonfmtpos, ctx.fmtpos) < 0)
15230 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015231 }
15232 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015233 ctx.fmtpos++;
15234 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015235 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015236 }
15237 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015238
Victor Stinnera47082312012-10-04 02:19:54 +020015239 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015240 PyErr_SetString(PyExc_TypeError,
15241 "not all arguments converted during string formatting");
15242 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015243 }
15244
Victor Stinnera47082312012-10-04 02:19:54 +020015245 if (ctx.args_owned) {
15246 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015247 }
Victor Stinnera47082312012-10-04 02:19:54 +020015248 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015249
Benjamin Peterson29060642009-01-31 22:14:21 +000015250 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015251 _PyUnicodeWriter_Dealloc(&ctx.writer);
15252 if (ctx.args_owned) {
15253 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015254 }
15255 return NULL;
15256}
15257
Jeremy Hylton938ace62002-07-17 16:30:39 +000015258static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015259unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15260
Tim Peters6d6c1a32001-08-02 04:15:00 +000015261static PyObject *
15262unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15263{
Benjamin Peterson29060642009-01-31 22:14:21 +000015264 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015265 static char *kwlist[] = {"object", "encoding", "errors", 0};
15266 char *encoding = NULL;
15267 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015268
Benjamin Peterson14339b62009-01-31 16:36:08 +000015269 if (type != &PyUnicode_Type)
15270 return unicode_subtype_new(type, args, kwds);
15271 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015272 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 return NULL;
15274 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015275 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015276 if (encoding == NULL && errors == NULL)
15277 return PyObject_Str(x);
15278 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015279 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015280}
15281
Guido van Rossume023fe02001-08-30 03:12:59 +000015282static PyObject *
15283unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15284{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015285 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015286 Py_ssize_t length, char_size;
15287 int share_wstr, share_utf8;
15288 unsigned int kind;
15289 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015290
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015292
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015293 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015294 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015295 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015296 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015297 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015298 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015299 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015300 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015301
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015302 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015303 if (self == NULL) {
15304 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015305 return NULL;
15306 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015307 kind = PyUnicode_KIND(unicode);
15308 length = PyUnicode_GET_LENGTH(unicode);
15309
15310 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015311#ifdef Py_DEBUG
15312 _PyUnicode_HASH(self) = -1;
15313#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015314 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015315#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015316 _PyUnicode_STATE(self).interned = 0;
15317 _PyUnicode_STATE(self).kind = kind;
15318 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015319 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015320 _PyUnicode_STATE(self).ready = 1;
15321 _PyUnicode_WSTR(self) = NULL;
15322 _PyUnicode_UTF8_LENGTH(self) = 0;
15323 _PyUnicode_UTF8(self) = NULL;
15324 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015325 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015326
15327 share_utf8 = 0;
15328 share_wstr = 0;
15329 if (kind == PyUnicode_1BYTE_KIND) {
15330 char_size = 1;
15331 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15332 share_utf8 = 1;
15333 }
15334 else if (kind == PyUnicode_2BYTE_KIND) {
15335 char_size = 2;
15336 if (sizeof(wchar_t) == 2)
15337 share_wstr = 1;
15338 }
15339 else {
15340 assert(kind == PyUnicode_4BYTE_KIND);
15341 char_size = 4;
15342 if (sizeof(wchar_t) == 4)
15343 share_wstr = 1;
15344 }
15345
15346 /* Ensure we won't overflow the length. */
15347 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15348 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015349 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015350 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015351 data = PyObject_MALLOC((length + 1) * char_size);
15352 if (data == NULL) {
15353 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015354 goto onError;
15355 }
15356
Victor Stinnerc3c74152011-10-02 20:39:55 +020015357 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015358 if (share_utf8) {
15359 _PyUnicode_UTF8_LENGTH(self) = length;
15360 _PyUnicode_UTF8(self) = data;
15361 }
15362 if (share_wstr) {
15363 _PyUnicode_WSTR_LENGTH(self) = length;
15364 _PyUnicode_WSTR(self) = (wchar_t *)data;
15365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015366
Christian Heimesf051e432016-09-13 20:22:02 +020015367 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015368 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015369 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015370#ifdef Py_DEBUG
15371 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15372#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015373 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015374 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015375
15376onError:
15377 Py_DECREF(unicode);
15378 Py_DECREF(self);
15379 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015380}
15381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015382PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015383"str(object='') -> str\n\
15384str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015385\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015386Create a new string object from the given object. If encoding or\n\
15387errors is specified, then the object must expose a data buffer\n\
15388that will be decoded using the given encoding and error handler.\n\
15389Otherwise, returns the result of object.__str__() (if defined)\n\
15390or repr(object).\n\
15391encoding defaults to sys.getdefaultencoding().\n\
15392errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015393
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015394static PyObject *unicode_iter(PyObject *seq);
15395
Guido van Rossumd57fd912000-03-10 22:53:23 +000015396PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015397 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015398 "str", /* tp_name */
15399 sizeof(PyUnicodeObject), /* tp_basicsize */
15400 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015401 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015402 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015403 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015404 0, /* tp_getattr */
15405 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015406 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015407 unicode_repr, /* tp_repr */
15408 &unicode_as_number, /* tp_as_number */
15409 &unicode_as_sequence, /* tp_as_sequence */
15410 &unicode_as_mapping, /* tp_as_mapping */
15411 (hashfunc) unicode_hash, /* tp_hash*/
15412 0, /* tp_call*/
15413 (reprfunc) unicode_str, /* tp_str */
15414 PyObject_GenericGetAttr, /* tp_getattro */
15415 0, /* tp_setattro */
15416 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015418 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15419 unicode_doc, /* tp_doc */
15420 0, /* tp_traverse */
15421 0, /* tp_clear */
15422 PyUnicode_RichCompare, /* tp_richcompare */
15423 0, /* tp_weaklistoffset */
15424 unicode_iter, /* tp_iter */
15425 0, /* tp_iternext */
15426 unicode_methods, /* tp_methods */
15427 0, /* tp_members */
15428 0, /* tp_getset */
15429 &PyBaseObject_Type, /* tp_base */
15430 0, /* tp_dict */
15431 0, /* tp_descr_get */
15432 0, /* tp_descr_set */
15433 0, /* tp_dictoffset */
15434 0, /* tp_init */
15435 0, /* tp_alloc */
15436 unicode_new, /* tp_new */
15437 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015438};
15439
15440/* Initialize the Unicode implementation */
15441
Victor Stinner331a6a52019-05-27 16:39:22 +020015442PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015443_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015444{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015445 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015446 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015447 0x000A, /* LINE FEED */
15448 0x000D, /* CARRIAGE RETURN */
15449 0x001C, /* FILE SEPARATOR */
15450 0x001D, /* GROUP SEPARATOR */
15451 0x001E, /* RECORD SEPARATOR */
15452 0x0085, /* NEXT LINE */
15453 0x2028, /* LINE SEPARATOR */
15454 0x2029, /* PARAGRAPH SEPARATOR */
15455 };
15456
Fred Drakee4315f52000-05-09 19:53:39 +000015457 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015458 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015459 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015460 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015461 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015462 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015463
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015464 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015465 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015466 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015467
15468 /* initialize the linebreak bloom filter */
15469 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015470 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015471 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015472
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015473 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015474 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015475 }
15476 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015477 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015478 }
15479 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015480 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015481 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015482 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015483}
15484
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015485
Walter Dörwald16807132007-05-25 13:52:07 +000015486void
15487PyUnicode_InternInPlace(PyObject **p)
15488{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015489 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015490 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015491#ifdef Py_DEBUG
15492 assert(s != NULL);
15493 assert(_PyUnicode_CHECK(s));
15494#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015495 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015496 return;
15497#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015498 /* If it's a subclass, we don't really know what putting
15499 it in the interned dict might do. */
15500 if (!PyUnicode_CheckExact(s))
15501 return;
15502 if (PyUnicode_CHECK_INTERNED(s))
15503 return;
15504 if (interned == NULL) {
15505 interned = PyDict_New();
15506 if (interned == NULL) {
15507 PyErr_Clear(); /* Don't leave an exception */
15508 return;
15509 }
15510 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015511 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015512 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015513 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015514 if (t == NULL) {
15515 PyErr_Clear();
15516 return;
15517 }
15518 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015519 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015520 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015521 return;
15522 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015523 /* The two references in interned are not counted by refcnt.
15524 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015525 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015526 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015527}
15528
15529void
15530PyUnicode_InternImmortal(PyObject **p)
15531{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015532 PyUnicode_InternInPlace(p);
15533 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015534 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015535 Py_INCREF(*p);
15536 }
Walter Dörwald16807132007-05-25 13:52:07 +000015537}
15538
15539PyObject *
15540PyUnicode_InternFromString(const char *cp)
15541{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015542 PyObject *s = PyUnicode_FromString(cp);
15543 if (s == NULL)
15544 return NULL;
15545 PyUnicode_InternInPlace(&s);
15546 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015547}
15548
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015549
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Release the interned dict so that a leak detector does not report the
   interned strings.

   Each key of the interned dict gets back the references which interning
   removed from its reference count, and is marked as not interned.  The
   dict is then cleared and released.  With INTERNED_STATS defined,
   statistics are printed to stderr.

   Nothing is done if the interned dict does not exist or if its keys
   cannot be listed (the error is cleared). */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        /* The reference count is updated with Py_SET_REFCNT(), like in
           PyUnicode_InternInPlace(), rather than by assigning to
           Py_REFCNT(): the latter requires Py_REFCNT() to be an lvalue. */
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* Interning removed two references and making the string
               immortal added one: one reference is missing */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Interning removed two references */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015610
15611
15612/********************* Unicode Iterator **************************/
15613
15614typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015615 PyObject_HEAD
15616 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015617 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015618} unicodeiterobject;
15619
15620static void
15621unicodeiter_dealloc(unicodeiterobject *it)
15622{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015623 _PyObject_GC_UNTRACK(it);
15624 Py_XDECREF(it->it_seq);
15625 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015626}
15627
15628static int
15629unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15630{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015631 Py_VISIT(it->it_seq);
15632 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015633}
15634
15635static PyObject *
15636unicodeiter_next(unicodeiterobject *it)
15637{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015638 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015639
Benjamin Peterson14339b62009-01-31 16:36:08 +000015640 assert(it != NULL);
15641 seq = it->it_seq;
15642 if (seq == NULL)
15643 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015644 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015646 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15647 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015648 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015649 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15650 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015651 if (item != NULL)
15652 ++it->it_index;
15653 return item;
15654 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015655
Benjamin Peterson14339b62009-01-31 16:36:08 +000015656 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015657 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015658 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015659}
15660
15661static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015662unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015663{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015664 Py_ssize_t len = 0;
15665 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015666 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015667 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015668}
15669
15670PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15671
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015672static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015673unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015674{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015675 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015676 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015677 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015678 it->it_seq, it->it_index);
15679 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015680 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015681 if (u == NULL)
15682 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015683 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015684 }
15685}
15686
15687PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15688
15689static PyObject *
15690unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15691{
15692 Py_ssize_t index = PyLong_AsSsize_t(state);
15693 if (index == -1 && PyErr_Occurred())
15694 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015695 if (it->it_seq != NULL) {
15696 if (index < 0)
15697 index = 0;
15698 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15699 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15700 it->it_index = index;
15701 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015702 Py_RETURN_NONE;
15703}
15704
15705PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15706
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015707static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015708 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015709 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015710 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15711 reduce_doc},
15712 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15713 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015714 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015715};
15716
15717PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015718 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15719 "str_iterator", /* tp_name */
15720 sizeof(unicodeiterobject), /* tp_basicsize */
15721 0, /* tp_itemsize */
15722 /* methods */
15723 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015724 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015725 0, /* tp_getattr */
15726 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015727 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015728 0, /* tp_repr */
15729 0, /* tp_as_number */
15730 0, /* tp_as_sequence */
15731 0, /* tp_as_mapping */
15732 0, /* tp_hash */
15733 0, /* tp_call */
15734 0, /* tp_str */
15735 PyObject_GenericGetAttr, /* tp_getattro */
15736 0, /* tp_setattro */
15737 0, /* tp_as_buffer */
15738 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15739 0, /* tp_doc */
15740 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15741 0, /* tp_clear */
15742 0, /* tp_richcompare */
15743 0, /* tp_weaklistoffset */
15744 PyObject_SelfIter, /* tp_iter */
15745 (iternextfunc)unicodeiter_next, /* tp_iternext */
15746 unicodeiter_methods, /* tp_methods */
15747 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015748};
15749
15750static PyObject *
15751unicode_iter(PyObject *seq)
15752{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015753 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015754
Benjamin Peterson14339b62009-01-31 16:36:08 +000015755 if (!PyUnicode_Check(seq)) {
15756 PyErr_BadInternalCall();
15757 return NULL;
15758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015759 if (PyUnicode_READY(seq) == -1)
15760 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015761 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15762 if (it == NULL)
15763 return NULL;
15764 it->it_index = 0;
15765 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015766 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015767 _PyObject_GC_TRACK(it);
15768 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015769}
15770
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015771
15772size_t
15773Py_UNICODE_strlen(const Py_UNICODE *u)
15774{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015775 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015776}
15777
15778Py_UNICODE*
15779Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15780{
15781 Py_UNICODE *u = s1;
15782 while ((*u++ = *s2++));
15783 return s1;
15784}
15785
15786Py_UNICODE*
15787Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15788{
15789 Py_UNICODE *u = s1;
15790 while ((*u++ = *s2++))
15791 if (n-- == 0)
15792 break;
15793 return s1;
15794}
15795
15796Py_UNICODE*
15797Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15798{
15799 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015800 u1 += wcslen(u1);
15801 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015802 return s1;
15803}
15804
15805int
15806Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15807{
15808 while (*s1 && *s2 && *s1 == *s2)
15809 s1++, s2++;
15810 if (*s1 && *s2)
15811 return (*s1 < *s2) ? -1 : +1;
15812 if (*s1)
15813 return 1;
15814 if (*s2)
15815 return -1;
15816 return 0;
15817}
15818
15819int
15820Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15821{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015822 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015823 for (; n != 0; n--) {
15824 u1 = *s1;
15825 u2 = *s2;
15826 if (u1 != u2)
15827 return (u1 < u2) ? -1 : +1;
15828 if (u1 == '\0')
15829 return 0;
15830 s1++;
15831 s2++;
15832 }
15833 return 0;
15834}
15835
15836Py_UNICODE*
15837Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15838{
15839 const Py_UNICODE *p;
15840 for (p = s; *p; p++)
15841 if (*p == c)
15842 return (Py_UNICODE*)p;
15843 return NULL;
15844}
15845
15846Py_UNICODE*
15847Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15848{
15849 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015850 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015851 while (p != s) {
15852 p--;
15853 if (*p == c)
15854 return (Py_UNICODE*)p;
15855 }
15856 return NULL;
15857}
Victor Stinner331ea922010-08-10 16:37:20 +000015858
Victor Stinner71133ff2010-09-01 23:43:53 +000015859Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015860PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015861{
Victor Stinner577db2c2011-10-11 22:12:48 +020015862 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015863 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015865 if (!PyUnicode_Check(unicode)) {
15866 PyErr_BadArgument();
15867 return NULL;
15868 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015869 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015870 if (u == NULL)
15871 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015872 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015873 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015874 PyErr_NoMemory();
15875 return NULL;
15876 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015877 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015878 size *= sizeof(Py_UNICODE);
15879 copy = PyMem_Malloc(size);
15880 if (copy == NULL) {
15881 PyErr_NoMemory();
15882 return NULL;
15883 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015884 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015885 return copy;
15886}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015887
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015888
Victor Stinner709d23d2019-05-02 14:56:30 -040015889static int
15890encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015891{
Victor Stinner709d23d2019-05-02 14:56:30 -040015892 int res;
15893 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15894 if (res == -2) {
15895 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15896 return -1;
15897 }
15898 if (res < 0) {
15899 PyErr_NoMemory();
15900 return -1;
15901 }
15902 return 0;
15903}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015904
Victor Stinner709d23d2019-05-02 14:56:30 -040015905
15906static int
15907config_get_codec_name(wchar_t **config_encoding)
15908{
15909 char *encoding;
15910 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15911 return -1;
15912 }
15913
15914 PyObject *name_obj = NULL;
15915 PyObject *codec = _PyCodec_Lookup(encoding);
15916 PyMem_RawFree(encoding);
15917
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015918 if (!codec)
15919 goto error;
15920
15921 name_obj = PyObject_GetAttrString(codec, "name");
15922 Py_CLEAR(codec);
15923 if (!name_obj) {
15924 goto error;
15925 }
15926
Victor Stinner709d23d2019-05-02 14:56:30 -040015927 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15928 Py_DECREF(name_obj);
15929 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015930 goto error;
15931 }
15932
Victor Stinner709d23d2019-05-02 14:56:30 -040015933 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15934 if (raw_wname == NULL) {
15935 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015936 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015937 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015938 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015939
15940 PyMem_RawFree(*config_encoding);
15941 *config_encoding = raw_wname;
15942
15943 PyMem_Free(wname);
15944 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015945
15946error:
15947 Py_XDECREF(codec);
15948 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015949 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015950}
15951
15952
Victor Stinner331a6a52019-05-27 16:39:22 +020015953static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015954init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015955{
Victor Stinner709d23d2019-05-02 14:56:30 -040015956 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020015957 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040015958 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015959 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015960 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015961 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015962 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015963}
15964
15965
Victor Stinner709d23d2019-05-02 14:56:30 -040015966static int
15967init_fs_codec(PyInterpreterState *interp)
15968{
Victor Stinnerda7933e2020-04-13 03:04:28 +020015969 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040015970
15971 _Py_error_handler error_handler;
15972 error_handler = get_error_handler_wide(config->filesystem_errors);
15973 if (error_handler == _Py_ERROR_UNKNOWN) {
15974 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15975 return -1;
15976 }
15977
15978 char *encoding, *errors;
15979 if (encode_wstr_utf8(config->filesystem_encoding,
15980 &encoding,
15981 "filesystem_encoding") < 0) {
15982 return -1;
15983 }
15984
15985 if (encode_wstr_utf8(config->filesystem_errors,
15986 &errors,
15987 "filesystem_errors") < 0) {
15988 PyMem_RawFree(encoding);
15989 return -1;
15990 }
15991
15992 PyMem_RawFree(interp->fs_codec.encoding);
15993 interp->fs_codec.encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015994 /* encoding has been normalized by init_fs_encoding() */
15995 interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
Victor Stinner709d23d2019-05-02 14:56:30 -040015996 PyMem_RawFree(interp->fs_codec.errors);
15997 interp->fs_codec.errors = errors;
15998 interp->fs_codec.error_handler = error_handler;
15999
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016000#ifdef _Py_FORCE_UTF8_FS_ENCODING
16001 assert(interp->fs_codec.utf8 == 1);
16002#endif
16003
Victor Stinner709d23d2019-05-02 14:56:30 -040016004 /* At this point, PyUnicode_EncodeFSDefault() and
16005 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16006 the C implementation of the filesystem encoding. */
16007
16008 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16009 global configuration variables. */
16010 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
16011 interp->fs_codec.errors) < 0) {
16012 PyErr_NoMemory();
16013 return -1;
16014 }
16015 return 0;
16016}
16017
16018
Victor Stinner331a6a52019-05-27 16:39:22 +020016019static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016020init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016021{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016022 PyInterpreterState *interp = tstate->interp;
16023
Victor Stinner709d23d2019-05-02 14:56:30 -040016024 /* Update the filesystem encoding to the normalized Python codec name.
16025 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16026 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016027 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016028 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016029 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016030 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016031 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016032 }
16033
Victor Stinner709d23d2019-05-02 14:56:30 -040016034 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016035 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016036 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016037 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016038}
16039
16040
Victor Stinner331a6a52019-05-27 16:39:22 +020016041PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016042_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016043{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016044 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016045 if (_PyStatus_EXCEPTION(status)) {
16046 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016047 }
16048
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016049 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016050}
16051
16052
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016053static void
16054_PyUnicode_FiniEncodings(PyThreadState *tstate)
16055{
16056 PyInterpreterState *interp = tstate->interp;
16057 PyMem_RawFree(interp->fs_codec.encoding);
16058 interp->fs_codec.encoding = NULL;
16059 interp->fs_codec.utf8 = 0;
16060 PyMem_RawFree(interp->fs_codec.errors);
16061 interp->fs_codec.errors = NULL;
16062 interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
16063}
16064
16065
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns 0 on success, -1 with an exception set on failure.  The
   configuration is left unchanged if the new strings cannot be
   allocated. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        /* Release whichever of the two allocations succeeded. */
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* Both copies exist: swap them in and release the old values. */
    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16091
16092
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016093void
Victor Stinner3d483342019-11-22 12:27:50 +010016094_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016095{
Victor Stinner3d483342019-11-22 12:27:50 +010016096 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016097#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016098 /* Insure++ is a memory analysis tool that aids in discovering
16099 * memory leaks and other memory problems. On Python exit, the
16100 * interned string dictionaries are flagged as being in use at exit
16101 * (which it is). Under normal circumstances, this is fine because
16102 * the memory will be automatically reclaimed by the system. Under
16103 * memory debugging, it's a huge source of useless noise, so we
16104 * trade off slower shutdown for less distraction in the memory
16105 * reports. -baw
16106 */
16107 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016108#endif /* __INSURE__ */
16109
Victor Stinner3d483342019-11-22 12:27:50 +010016110 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016111
Victor Stinner3d483342019-11-22 12:27:50 +010016112 for (Py_ssize_t i = 0; i < 256; i++) {
16113 Py_CLEAR(unicode_latin1[i]);
16114 }
16115 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016116 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016117
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016118 _PyUnicode_FiniEncodings(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016119}
16120
16121
Georg Brandl66c221e2010-10-14 07:04:07 +000016122/* A _string module, to export formatter_parser and formatter_field_name_split
16123 to the string.Formatter class implemented in Python. */
16124
16125static PyMethodDef _string_methods[] = {
16126 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16127 METH_O, PyDoc_STR("split the argument as a field name")},
16128 {"formatter_parser", (PyCFunction) formatter_parser,
16129 METH_O, PyDoc_STR("parse the argument as a format string")},
16130 {NULL, NULL}
16131};
16132
16133static struct PyModuleDef _string_module = {
16134 PyModuleDef_HEAD_INIT,
16135 "_string",
16136 PyDoc_STR("string helper module"),
16137 0,
16138 _string_methods,
16139 NULL,
16140 NULL,
16141 NULL,
16142 NULL
16143};
16144
16145PyMODINIT_FUNC
16146PyInit__string(void)
16147{
16148 return PyModule_Create(&_string_module);
16149}
16150
16151
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016152#ifdef __cplusplus
16153}
16154#endif