blob: e4235b1aca3cf6f21db5faa486b5cd2bf04e4f42 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020047#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinnerbcda8f12018-11-21 22:27:47 +010048#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020049#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040050#include "pycore_pylifecycle.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020051#include "pycore_pystate.h" // _PyInterpreterState_GET()
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000052#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070053#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
/* #define INTERNED_STATS 1 */
62
63
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff
98
/* Sanity check used by the assertions in the macros of this file.
   Debug builds (Py_DEBUG) call _PyUnicode_CheckConsistency() with
   check_content=0; other builds only perform the PyUnicode_Check() type
   test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200104
/* Raw field accessors.  The underscore-prefixed variants only cast `op` and
   read the field, so they can be used as lvalues; the variants without the
   prefix additionally assert that the object is valid and ready. */

/* utf8 field of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory that immediately follows the PyASCIIObject header, otherwise the
   utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))
/* utf8_length field of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: the `length` field for a compact ASCII
   string, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr field of the PyASCIIObject header. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* wstr length: the `length` field for a compact ASCII string, otherwise the
   wstr_length field.  `op` is parenthesized before the cast so that an
   expression argument (e.g. `base + 1`) is cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         ((PyCompactUnicodeObject*)(op))->wstr_length)
/* wstr_length field of the PyCompactUnicodeObject header. */
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length, state and hash fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting that `op` is a valid str object. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
/* length, after asserting that `op` is a valid str object. */
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)
/* data.any field of the PyUnicodeObject structure. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200146
/* Replace any earlier definition of PyUnicode_READY with a local one:
   evaluates to 0 when `op` is already ready, otherwise to the result of
   _PyUnicode_Ready(op).  Asserts that `op` passes _PyUnicode_CHECK(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200153
/* True if the utf8 pointer of `op` is the same pointer as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same pointer as its character
   data.  Only the macro parameter `op` is referenced, so the macro does not
   depend on a variable named `unicode` being in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
161
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   object is not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
175
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each macro argument is evaluated exactly once, before any other work.
   The main loop copies four characters per iteration; the second loop
   handles the remaining 0-3 characters.  All temporaries carry a leading
   underscore to keep them apart from caller identifiers. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200199
/* Platform-tuned over-allocation factor.  Going by the percentages below,
   a value of N corresponds to 1/N of extra space; the consumers of this
   macro are not part of this excerpt. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
207
Victor Stinner607b1022020-05-05 18:50:30 +0200208/* bpo-40521: Interned strings are shared by all interpreters. */
209#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
210# define INTERNED_STRINGS
211#endif
212
Walter Dörwald16807132007-05-25 13:52:07 +0000213/* This dictionary holds all interned unicode strings. Note that references
214 to strings in this dictionary are *not* counted in the string's ob_refcnt.
215 When the interned string reaches a refcnt of 0 the string deallocation
216 function will delete the reference from this dictionary.
217
218 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000219 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000220*/
Victor Stinner607b1022020-05-05 18:50:30 +0200221#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200223#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000224
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200225static struct _Py_unicode_state*
226get_unicode_state(void)
227{
228 PyInterpreterState *interp = _PyInterpreterState_GET();
229 return &interp->unicode;
230}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200231
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000232
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200234static inline PyObject* unicode_get_empty(void)
235{
236 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200237 // unicode_get_empty() must not be called before _PyUnicode_Init()
238 // or after _PyUnicode_Fini()
239 assert(state->empty != NULL);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240 return state->empty;
241}
242
243static inline PyObject* unicode_new_empty(void)
244{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200245 PyObject *empty = unicode_get_empty();
246 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200247 return empty;
248}
249
/* Statement macro: return a new reference to the empty string singleton
   from the enclosing function (which must return PyObject *). */
#define _Py_RETURN_UNICODE_EMPTY()   \
    do {                             \
        return unicode_new_empty();  \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254
Victor Stinner59423e32018-11-26 13:40:01 +0100255static inline void
256unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
257 Py_ssize_t start, Py_ssize_t length)
258{
259 assert(0 <= start);
260 assert(kind != PyUnicode_WCHAR_KIND);
261 switch (kind) {
262 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100263 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100264 Py_UCS1 ch = (unsigned char)value;
265 Py_UCS1 *to = (Py_UCS1 *)data + start;
266 memset(to, ch, length);
267 break;
268 }
269 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100270 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100271 Py_UCS2 ch = (Py_UCS2)value;
272 Py_UCS2 *to = (Py_UCS2 *)data + start;
273 const Py_UCS2 *end = to + length;
274 for (; to < end; ++to) *to = ch;
275 break;
276 }
277 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100278 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100279 Py_UCS4 ch = value;
280 Py_UCS4 * to = (Py_UCS4 *)data + start;
281 const Py_UCS4 *end = to + length;
282 for (; to < end; ++to) *to = ch;
283 break;
284 }
285 default: Py_UNREACHABLE();
286 }
287}
288
289
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200290/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700291static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200292_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900293static inline void
294_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400295static PyObject *
296unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
297 const char *errors);
298static PyObject *
299unicode_decode_utf8(const char *s, Py_ssize_t size,
300 _Py_error_handler error_handler, const char *errors,
301 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200302
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200303/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200304static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200305
Victor Stinner607b1022020-05-05 18:50:30 +0200306/* bpo-40521: Latin1 singletons are shared by all interpreters. */
307#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
308# define LATIN1_SINGLETONS
309#endif
310
311#ifdef LATIN1_SINGLETONS
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312/* Single character Unicode strings in the Latin-1 range are being
313 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200314static PyObject *unicode_latin1[256] = {NULL};
Victor Stinner607b1022020-05-05 18:50:30 +0200315#endif
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316
/* Fast detection of the most frequent whitespace characters.

   128-entry lookup table indexed by ASCII code: an entry is 1 when the
   character is whitespace and 0 otherwise.  Each row covers 8 codes. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
347
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200348/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200349static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200350static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100351static int unicode_modifiable(PyObject *unicode);
352
Victor Stinnerfe226c02011-10-03 03:52:20 +0200353
Alexander Belopolsky40018472011-02-26 01:02:56 +0000354static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100355_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200356static PyObject *
357_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
358static PyObject *
359_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
360
361static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000362unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000363 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100364 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000365 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
366
Alexander Belopolsky40018472011-02-26 01:02:56 +0000367static void
368raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300369 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100370 PyObject *unicode,
371 Py_ssize_t startpos, Py_ssize_t endpos,
372 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000373
/* Fast detection of line break characters.

   128-entry lookup table indexed by ASCII code: an entry is 1 when the
   character is a line break and 0 otherwise.  Each row covers 8 codes. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
401
INADA Naoki3ae20562017-01-16 20:41:20 +0900402static int convert_uc(PyObject *obj, void *addr);
403
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300404#include "clinic/unicodeobject.c.h"
405
Victor Stinner3d4226a2018-08-29 22:21:32 +0200406_Py_error_handler
407_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200408{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200409 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200410 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200411 }
412 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200413 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200414 }
415 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200416 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200417 }
418 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200419 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200420 }
421 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200422 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200423 }
424 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200425 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200426 }
427 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200428 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200429 }
Victor Stinner50149202015-09-22 00:26:54 +0200430 return _Py_ERROR_OTHER;
431}
432
Victor Stinner709d23d2019-05-02 14:56:30 -0400433
434static _Py_error_handler
435get_error_handler_wide(const wchar_t *errors)
436{
437 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
438 return _Py_ERROR_STRICT;
439 }
440 if (wcscmp(errors, L"surrogateescape") == 0) {
441 return _Py_ERROR_SURROGATEESCAPE;
442 }
443 if (wcscmp(errors, L"replace") == 0) {
444 return _Py_ERROR_REPLACE;
445 }
446 if (wcscmp(errors, L"ignore") == 0) {
447 return _Py_ERROR_IGNORE;
448 }
449 if (wcscmp(errors, L"backslashreplace") == 0) {
450 return _Py_ERROR_BACKSLASHREPLACE;
451 }
452 if (wcscmp(errors, L"surrogatepass") == 0) {
453 return _Py_ERROR_SURROGATEPASS;
454 }
455 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
456 return _Py_ERROR_XMLCHARREFREPLACE;
457 }
458 return _Py_ERROR_OTHER;
459}
460
461
Victor Stinner22eb6892019-06-26 00:51:05 +0200462static inline int
463unicode_check_encoding_errors(const char *encoding, const char *errors)
464{
465 if (encoding == NULL && errors == NULL) {
466 return 0;
467 }
468
Victor Stinner81a7be32020-04-14 15:14:01 +0200469 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200470#ifndef Py_DEBUG
471 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200472 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200473 return 0;
474 }
475#else
476 /* Always check in debug mode */
477#endif
478
479 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
480 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200481 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200482 return 0;
483 }
484
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200485 /* Disable checks during Python finalization. For example, it allows to
486 call _PyObject_Dump() during finalization for debugging purpose. */
487 if (interp->finalizing) {
488 return 0;
489 }
490
Victor Stinner22eb6892019-06-26 00:51:05 +0200491 if (encoding != NULL) {
492 PyObject *handler = _PyCodec_Lookup(encoding);
493 if (handler == NULL) {
494 return -1;
495 }
496 Py_DECREF(handler);
497 }
498
499 if (errors != NULL) {
500 PyObject *handler = PyCodec_LookupError(errors);
501 if (handler == NULL) {
502 return -1;
503 }
504 Py_DECREF(handler);
505 }
506 return 0;
507}
508
509
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300510/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
511 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000512Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000513PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000514{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000515#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000516 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000517#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 /* This is actually an illegal character, so it should
519 not be passed to unichr. */
520 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000521#endif
522}
523
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200524int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100525_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200526{
Victor Stinner68762572019-10-07 18:42:01 +0200527#define CHECK(expr) \
528 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
529
Victor Stinner910337b2011-10-03 03:20:16 +0200530 PyASCIIObject *ascii;
531 unsigned int kind;
532
Victor Stinner68762572019-10-07 18:42:01 +0200533 assert(op != NULL);
534 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200535
536 ascii = (PyASCIIObject *)op;
537 kind = ascii->state.kind;
538
Victor Stinnera3b334d2011-10-03 13:53:37 +0200539 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200540 CHECK(kind == PyUnicode_1BYTE_KIND);
541 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200542 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200543 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200544 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200545 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200546
Victor Stinnera41463c2011-10-04 01:05:08 +0200547 if (ascii->state.compact == 1) {
548 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200549 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200550 || kind == PyUnicode_2BYTE_KIND
551 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200552 CHECK(ascii->state.ascii == 0);
553 CHECK(ascii->state.ready == 1);
554 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100555 }
556 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
558
559 data = unicode->data.any;
560 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(ascii->length == 0);
562 CHECK(ascii->hash == -1);
563 CHECK(ascii->state.compact == 0);
564 CHECK(ascii->state.ascii == 0);
565 CHECK(ascii->state.ready == 0);
566 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
567 CHECK(ascii->wstr != NULL);
568 CHECK(data == NULL);
569 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 }
571 else {
Victor Stinner68762572019-10-07 18:42:01 +0200572 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200573 || kind == PyUnicode_2BYTE_KIND
574 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200575 CHECK(ascii->state.compact == 0);
576 CHECK(ascii->state.ready == 1);
577 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200578 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200579 CHECK(compact->utf8 == data);
580 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200581 }
582 else
Victor Stinner68762572019-10-07 18:42:01 +0200583 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200584 }
585 }
586 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200587 if (
588#if SIZEOF_WCHAR_T == 2
589 kind == PyUnicode_2BYTE_KIND
590#else
591 kind == PyUnicode_4BYTE_KIND
592#endif
593 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200594 {
Victor Stinner68762572019-10-07 18:42:01 +0200595 CHECK(ascii->wstr == data);
596 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200597 } else
Victor Stinner68762572019-10-07 18:42:01 +0200598 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200599 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200600
601 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200602 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200603 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200604 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200605 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200606
607 /* check that the best kind is used: O(n) operation */
608 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200609 Py_ssize_t i;
610 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300611 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200612 Py_UCS4 ch;
613
614 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200615 for (i=0; i < ascii->length; i++)
616 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200617 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200618 if (ch > maxchar)
619 maxchar = ch;
620 }
621 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100622 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200623 CHECK(maxchar >= 128);
624 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100625 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200626 else
Victor Stinner68762572019-10-07 18:42:01 +0200627 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200628 }
Victor Stinner77faf692011-11-20 18:56:05 +0100629 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200630 CHECK(maxchar >= 0x100);
631 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100632 }
633 else {
Victor Stinner68762572019-10-07 18:42:01 +0200634 CHECK(maxchar >= 0x10000);
635 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100636 }
Victor Stinner68762572019-10-07 18:42:01 +0200637 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200638 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400639 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200640
641#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400642}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200643
Victor Stinner910337b2011-10-03 03:20:16 +0200644
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100645static PyObject*
646unicode_result_wchar(PyObject *unicode)
647{
648#ifndef Py_DEBUG
649 Py_ssize_t len;
650
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 len = _PyUnicode_WSTR_LENGTH(unicode);
652 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200654 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100655 }
656
657 if (len == 1) {
658 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100659 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100660 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
661 Py_DECREF(unicode);
662 return latin1_char;
663 }
664 }
665
666 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200667 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 return NULL;
669 }
670#else
Victor Stinneraa771272012-10-04 02:32:58 +0200671 assert(Py_REFCNT(unicode) == 1);
672
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100673 /* don't make the result ready in debug mode to ensure that the caller
674 makes the string ready before using it */
675 assert(_PyUnicode_CheckConsistency(unicode, 1));
676#endif
677 return unicode;
678}
679
680static PyObject*
681unicode_result_ready(PyObject *unicode)
682{
683 Py_ssize_t length;
684
685 length = PyUnicode_GET_LENGTH(unicode);
686 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200687 PyObject *empty = unicode_get_empty();
688 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100689 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200690 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100691 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200692 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100693 }
694
Victor Stinner607b1022020-05-05 18:50:30 +0200695#ifdef LATIN1_SINGLETONS
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100696 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300697 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200698 int kind = PyUnicode_KIND(unicode);
699 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100700 if (ch < 256) {
701 PyObject *latin1_char = unicode_latin1[ch];
702 if (latin1_char != NULL) {
703 if (unicode != latin1_char) {
704 Py_INCREF(latin1_char);
705 Py_DECREF(unicode);
706 }
707 return latin1_char;
708 }
709 else {
710 assert(_PyUnicode_CheckConsistency(unicode, 1));
711 Py_INCREF(unicode);
712 unicode_latin1[ch] = unicode;
713 return unicode;
714 }
715 }
716 }
Victor Stinner607b1022020-05-05 18:50:30 +0200717#endif
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100718
719 assert(_PyUnicode_CheckConsistency(unicode, 1));
720 return unicode;
721}
722
723static PyObject*
724unicode_result(PyObject *unicode)
725{
726 assert(_PyUnicode_CHECK(unicode));
727 if (PyUnicode_IS_READY(unicode))
728 return unicode_result_ready(unicode);
729 else
730 return unicode_result_wchar(unicode);
731}
732
Victor Stinnerc4b49542011-12-11 22:44:26 +0100733static PyObject*
734unicode_result_unchanged(PyObject *unicode)
735{
736 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500737 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100738 return NULL;
739 Py_INCREF(unicode);
740 return unicode;
741 }
742 else
743 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100744 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100745}
746
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
748 ASCII, Latin1, UTF-8, etc. */
749static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200750backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200751 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
752{
Victor Stinnerad771582015-10-09 12:38:53 +0200753 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200754 Py_UCS4 ch;
755 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300756 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200757
758 assert(PyUnicode_IS_READY(unicode));
759 kind = PyUnicode_KIND(unicode);
760 data = PyUnicode_DATA(unicode);
761
762 size = 0;
763 /* determine replacement size */
764 for (i = collstart; i < collend; ++i) {
765 Py_ssize_t incr;
766
767 ch = PyUnicode_READ(kind, data, i);
768 if (ch < 0x100)
769 incr = 2+2;
770 else if (ch < 0x10000)
771 incr = 2+4;
772 else {
773 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200774 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200775 }
776 if (size > PY_SSIZE_T_MAX - incr) {
777 PyErr_SetString(PyExc_OverflowError,
778 "encoded result is too long for a Python string");
779 return NULL;
780 }
781 size += incr;
782 }
783
Victor Stinnerad771582015-10-09 12:38:53 +0200784 str = _PyBytesWriter_Prepare(writer, str, size);
785 if (str == NULL)
786 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200787
788 /* generate replacement */
789 for (i = collstart; i < collend; ++i) {
790 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200791 *str++ = '\\';
792 if (ch >= 0x00010000) {
793 *str++ = 'U';
794 *str++ = Py_hexdigits[(ch>>28)&0xf];
795 *str++ = Py_hexdigits[(ch>>24)&0xf];
796 *str++ = Py_hexdigits[(ch>>20)&0xf];
797 *str++ = Py_hexdigits[(ch>>16)&0xf];
798 *str++ = Py_hexdigits[(ch>>12)&0xf];
799 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200800 }
Victor Stinner797485e2015-10-09 03:17:30 +0200801 else if (ch >= 0x100) {
802 *str++ = 'u';
803 *str++ = Py_hexdigits[(ch>>12)&0xf];
804 *str++ = Py_hexdigits[(ch>>8)&0xf];
805 }
806 else
807 *str++ = 'x';
808 *str++ = Py_hexdigits[(ch>>4)&0xf];
809 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200810 }
811 return str;
812}
813
814/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
815 ASCII, Latin1, UTF-8, etc. */
816static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200817xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200818 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
819{
Victor Stinnerad771582015-10-09 12:38:53 +0200820 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200821 Py_UCS4 ch;
822 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300823 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200824
825 assert(PyUnicode_IS_READY(unicode));
826 kind = PyUnicode_KIND(unicode);
827 data = PyUnicode_DATA(unicode);
828
829 size = 0;
830 /* determine replacement size */
831 for (i = collstart; i < collend; ++i) {
832 Py_ssize_t incr;
833
834 ch = PyUnicode_READ(kind, data, i);
835 if (ch < 10)
836 incr = 2+1+1;
837 else if (ch < 100)
838 incr = 2+2+1;
839 else if (ch < 1000)
840 incr = 2+3+1;
841 else if (ch < 10000)
842 incr = 2+4+1;
843 else if (ch < 100000)
844 incr = 2+5+1;
845 else if (ch < 1000000)
846 incr = 2+6+1;
847 else {
848 assert(ch <= MAX_UNICODE);
849 incr = 2+7+1;
850 }
851 if (size > PY_SSIZE_T_MAX - incr) {
852 PyErr_SetString(PyExc_OverflowError,
853 "encoded result is too long for a Python string");
854 return NULL;
855 }
856 size += incr;
857 }
858
Victor Stinnerad771582015-10-09 12:38:53 +0200859 str = _PyBytesWriter_Prepare(writer, str, size);
860 if (str == NULL)
861 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200862
863 /* generate replacement */
864 for (i = collstart; i < collend; ++i) {
865 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
866 }
867 return str;
868}
869
Thomas Wouters477c8d52006-05-27 19:21:47 +0000870/* --- Bloom Filters ----------------------------------------------------- */
871
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
875
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200876/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877
/* BLOOM_WIDTH: number of bits used in a bloom mask -- 128, 64 or 32,
   the largest of these that is not greater than LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Initially every bit is set, so BLOOM(bloom_linebreak, ch) is non-zero for
   every character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of ch is set in mask.
   mask is parenthesized so that an expression such as (a | b) can be passed
   without the & of the expansion binding to its last operand only. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup in ascii_linebreak[] below 128, otherwise
   the bloom mask is consulted first and Py_UNICODE_ISLINEBREAK() only if the
   mask matches.  Note: ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000897
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700898static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300899make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000900{
Victor Stinnera85af502013-04-09 21:53:54 +0200901#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
902 do { \
903 TYPE *data = (TYPE *)PTR; \
904 TYPE *end = data + LEN; \
905 Py_UCS4 ch; \
906 for (; data != end; data++) { \
907 ch = *data; \
908 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
909 } \
910 break; \
911 } while (0)
912
Thomas Wouters477c8d52006-05-27 19:21:47 +0000913 /* calculate simple bloom-style bitmask for a given unicode string */
914
Antoine Pitrouf068f942010-01-13 14:19:12 +0000915 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000916
917 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200918 switch (kind) {
919 case PyUnicode_1BYTE_KIND:
920 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
921 break;
922 case PyUnicode_2BYTE_KIND:
923 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
924 break;
925 case PyUnicode_4BYTE_KIND:
926 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
927 break;
928 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700929 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200930 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000931 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200932
933#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000934}
935
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300936static int
937ensure_unicode(PyObject *obj)
938{
939 if (!PyUnicode_Check(obj)) {
940 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200941 "must be str, not %.100s",
942 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300943 return -1;
944 }
945 return PyUnicode_READY(obj);
946}
947
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200948/* Compilation of templated routines */
949
Victor Stinner90ed8a62020-06-24 00:34:07 +0200950#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200951
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200952#include "stringlib/asciilib.h"
953#include "stringlib/fastsearch.h"
954#include "stringlib/partition.h"
955#include "stringlib/split.h"
956#include "stringlib/count.h"
957#include "stringlib/find.h"
958#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200959#include "stringlib/undef.h"
960
961#include "stringlib/ucs1lib.h"
962#include "stringlib/fastsearch.h"
963#include "stringlib/partition.h"
964#include "stringlib/split.h"
965#include "stringlib/count.h"
966#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300967#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200968#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200969#include "stringlib/undef.h"
970
971#include "stringlib/ucs2lib.h"
972#include "stringlib/fastsearch.h"
973#include "stringlib/partition.h"
974#include "stringlib/split.h"
975#include "stringlib/count.h"
976#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300977#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200978#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200979#include "stringlib/undef.h"
980
981#include "stringlib/ucs4lib.h"
982#include "stringlib/fastsearch.h"
983#include "stringlib/partition.h"
984#include "stringlib/split.h"
985#include "stringlib/count.h"
986#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300987#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200988#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200989#include "stringlib/undef.h"
990
Inada Naoki2c4928d2020-06-17 20:09:44 +0900991_Py_COMP_DIAG_PUSH
992_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200993#include "stringlib/unicodedefs.h"
994#include "stringlib/fastsearch.h"
995#include "stringlib/count.h"
996#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100997#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900998_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200999
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001000#undef STRINGLIB_GET_EMPTY
1001
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002/* --- Unicode Object ----------------------------------------------------- */
1003
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07001004static inline Py_ssize_t
1005findchar(const void *s, int kind,
1006 Py_ssize_t size, Py_UCS4 ch,
1007 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001008{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001009 switch (kind) {
1010 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001011 if ((Py_UCS1) ch != ch)
1012 return -1;
1013 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001014 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001015 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001016 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001017 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001018 if ((Py_UCS2) ch != ch)
1019 return -1;
1020 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001021 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001022 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001023 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001024 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001025 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001026 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001027 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001028 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001029 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001030 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001032}
1033
#ifdef Py_DEBUG
/* Overwrite the characters added by a resize, i.e. the range
   [old_length, current length), with 0xff bytes so that code reading them
   before they are initialized is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) rejects such content at least for
   ASCII strings (0xFF is not below 128) and for UCS-4 strings (0xFFFFFFFF is
   above MAX_UNICODE).

   Nothing is written if the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (old_length < new_length) {
        /* offsets are in characters, memset() works in bytes */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053static PyObject*
1054resize_compact(PyObject *unicode, Py_ssize_t length)
1055{
1056 Py_ssize_t char_size;
1057 Py_ssize_t struct_size;
1058 Py_ssize_t new_size;
1059 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001060 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001061#ifdef Py_DEBUG
1062 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1063#endif
1064
Victor Stinner79891572012-05-03 13:43:07 +02001065 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001066 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001067 assert(PyUnicode_IS_COMPACT(unicode));
1068
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001069 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001070 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071 struct_size = sizeof(PyASCIIObject);
1072 else
1073 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001074 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001075
Victor Stinnerfe226c02011-10-03 03:52:20 +02001076 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1077 PyErr_NoMemory();
1078 return NULL;
1079 }
1080 new_size = (struct_size + (length + 1) * char_size);
1081
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001082 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1083 PyObject_DEL(_PyUnicode_UTF8(unicode));
1084 _PyUnicode_UTF8(unicode) = NULL;
1085 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1086 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001087#ifdef Py_REF_DEBUG
1088 _Py_RefTotal--;
1089#endif
1090#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001091 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001092#endif
Victor Stinner84def372011-12-11 20:04:56 +01001093
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001094 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001095 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001096 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001097 PyErr_NoMemory();
1098 return NULL;
1099 }
Victor Stinner84def372011-12-11 20:04:56 +01001100 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001101 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001102
Victor Stinnerfe226c02011-10-03 03:52:20 +02001103 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001104 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001105 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001106 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001107 _PyUnicode_WSTR_LENGTH(unicode) = length;
1108 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001109 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1110 PyObject_DEL(_PyUnicode_WSTR(unicode));
1111 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001112 if (!PyUnicode_IS_ASCII(unicode))
1113 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001114 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001115#ifdef Py_DEBUG
1116 unicode_fill_invalid(unicode, old_length);
1117#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001118 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1119 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001120 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121 return unicode;
1122}
1123
Alexander Belopolsky40018472011-02-26 01:02:56 +00001124static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001125resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001126{
Victor Stinner95663112011-10-04 01:03:50 +02001127 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001130 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001131
Victor Stinnerfe226c02011-10-03 03:52:20 +02001132 if (PyUnicode_IS_READY(unicode)) {
1133 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001134 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001136#ifdef Py_DEBUG
1137 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1138#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001139
1140 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001141 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001142 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1143 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001144
1145 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1146 PyErr_NoMemory();
1147 return -1;
1148 }
1149 new_size = (length + 1) * char_size;
1150
Victor Stinner7a9105a2011-12-12 00:13:42 +01001151 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1152 {
1153 PyObject_DEL(_PyUnicode_UTF8(unicode));
1154 _PyUnicode_UTF8(unicode) = NULL;
1155 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1156 }
1157
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 data = (PyObject *)PyObject_REALLOC(data, new_size);
1159 if (data == NULL) {
1160 PyErr_NoMemory();
1161 return -1;
1162 }
1163 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001164 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001165 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001166 _PyUnicode_WSTR_LENGTH(unicode) = length;
1167 }
1168 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001169 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001170 _PyUnicode_UTF8_LENGTH(unicode) = length;
1171 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001172 _PyUnicode_LENGTH(unicode) = length;
1173 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001174#ifdef Py_DEBUG
1175 unicode_fill_invalid(unicode, old_length);
1176#endif
Victor Stinner95663112011-10-04 01:03:50 +02001177 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001178 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001179 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001181 }
Victor Stinner95663112011-10-04 01:03:50 +02001182 assert(_PyUnicode_WSTR(unicode) != NULL);
1183
1184 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001185 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001186 PyErr_NoMemory();
1187 return -1;
1188 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001189 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001190 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001191 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001192 if (!wstr) {
1193 PyErr_NoMemory();
1194 return -1;
1195 }
1196 _PyUnicode_WSTR(unicode) = wstr;
1197 _PyUnicode_WSTR(unicode)[length] = 0;
1198 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001199 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 return 0;
1201}
1202
Victor Stinnerfe226c02011-10-03 03:52:20 +02001203static PyObject*
1204resize_copy(PyObject *unicode, Py_ssize_t length)
1205{
1206 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001207 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001209
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001210 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001211
1212 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1213 if (copy == NULL)
1214 return NULL;
1215
1216 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001217 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001218 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001219 }
1220 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001221 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001222
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001223 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001224 if (w == NULL)
1225 return NULL;
1226 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1227 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001228 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001229 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001230 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001231 }
1232}
1233
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1242
Alexander Belopolsky40018472011-02-26 01:02:56 +00001243static PyUnicodeObject *
1244_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001246 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248
Thomas Wouters477c8d52006-05-27 19:21:47 +00001249 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001250 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001251 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252 }
1253
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001254 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001255 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001256 return (PyUnicodeObject *)PyErr_NoMemory();
1257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 if (length < 0) {
1259 PyErr_SetString(PyExc_SystemError,
1260 "Negative size passed to _PyUnicode_New");
1261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 }
1263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1265 if (unicode == NULL)
1266 return NULL;
1267 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001268
1269 _PyUnicode_WSTR_LENGTH(unicode) = length;
1270 _PyUnicode_HASH(unicode) = -1;
1271 _PyUnicode_STATE(unicode).interned = 0;
1272 _PyUnicode_STATE(unicode).kind = 0;
1273 _PyUnicode_STATE(unicode).compact = 0;
1274 _PyUnicode_STATE(unicode).ready = 0;
1275 _PyUnicode_STATE(unicode).ascii = 0;
1276 _PyUnicode_DATA_ANY(unicode) = NULL;
1277 _PyUnicode_LENGTH(unicode) = 0;
1278 _PyUnicode_UTF8(unicode) = NULL;
1279 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001281 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1282 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001283 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001284 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001285 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001287
Jeremy Hyltond8082792003-09-16 19:41:39 +00001288 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001289 * the caller fails before initializing str -- unicode_resize()
1290 * reads str[0], and the Keep-Alive optimization can keep memory
1291 * allocated for str alive across a call to unicode_dealloc(unicode).
1292 * We don't want unicode_resize to read uninitialized memory in
1293 * that case.
1294 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295 _PyUnicode_WSTR(unicode)[0] = 0;
1296 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001297
Victor Stinner7931d9a2011-11-04 00:22:48 +01001298 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299 return unicode;
1300}
1301
Victor Stinnerf42dc442011-10-02 23:33:16 +02001302static const char*
1303unicode_kind_name(PyObject *unicode)
1304{
Victor Stinner42dfd712011-10-03 14:41:45 +02001305 /* don't check consistency: unicode_kind_name() is called from
1306 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001307 if (!PyUnicode_IS_COMPACT(unicode))
1308 {
1309 if (!PyUnicode_IS_READY(unicode))
1310 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001311 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001312 {
1313 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001314 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001315 return "legacy ascii";
1316 else
1317 return "legacy latin1";
1318 case PyUnicode_2BYTE_KIND:
1319 return "legacy UCS2";
1320 case PyUnicode_4BYTE_KIND:
1321 return "legacy UCS4";
1322 default:
1323 return "<legacy invalid kind>";
1324 }
1325 }
1326 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001327 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001328 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001329 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001330 return "ascii";
1331 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001332 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001333 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001334 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001335 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001336 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001337 default:
1338 return "<invalid compact kind>";
1339 }
1340}
1341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger.
   Takes an untyped pointer and treats it as a PyObject. */
const char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1348
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger.  Takes an untyped pointer and treats it as a PyObject. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001353const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001354 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001355 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1357 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1358 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1359 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1360 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1361 return PyUnicode_DATA(unicode);
1362}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001363
1364void
1365_PyUnicode_Dump(PyObject *op)
1366{
1367 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001368 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1369 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001370 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001371
Victor Stinnera849a4b2011-10-03 12:12:11 +02001372 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001373 {
1374 if (ascii->state.ascii)
1375 data = (ascii + 1);
1376 else
1377 data = (compact + 1);
1378 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001379 else
1380 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001381 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001382
Victor Stinnera849a4b2011-10-03 12:12:11 +02001383 if (ascii->wstr == data)
1384 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001385 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001386
Victor Stinnera3b334d2011-10-03 13:53:37 +02001387 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001388 printf(" (%zu), ", compact->wstr_length);
1389 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001390 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001391 }
1392 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001393 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001394 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001395}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396#endif
1397
1398PyObject *
1399PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1400{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001401 /* Optimization for empty strings */
1402 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001403 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001404 }
1405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 PyObject *obj;
1407 PyCompactUnicodeObject *unicode;
1408 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001409 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001410 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 Py_ssize_t char_size;
1412 Py_ssize_t struct_size;
1413
Victor Stinner9e9d6892011-10-04 01:02:02 +02001414 is_ascii = 0;
1415 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 struct_size = sizeof(PyCompactUnicodeObject);
1417 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001418 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 char_size = 1;
1420 is_ascii = 1;
1421 struct_size = sizeof(PyASCIIObject);
1422 }
1423 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001424 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 char_size = 1;
1426 }
1427 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001428 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 char_size = 2;
1430 if (sizeof(wchar_t) == 2)
1431 is_sharing = 1;
1432 }
1433 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001434 if (maxchar > MAX_UNICODE) {
1435 PyErr_SetString(PyExc_SystemError,
1436 "invalid maximum character passed to PyUnicode_New");
1437 return NULL;
1438 }
Victor Stinner8f825062012-04-27 13:55:39 +02001439 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 char_size = 4;
1441 if (sizeof(wchar_t) == 4)
1442 is_sharing = 1;
1443 }
1444
1445 /* Ensure we won't overflow the size. */
1446 if (size < 0) {
1447 PyErr_SetString(PyExc_SystemError,
1448 "Negative size passed to PyUnicode_New");
1449 return NULL;
1450 }
1451 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1452 return PyErr_NoMemory();
1453
1454 /* Duplicated allocation code from _PyObject_New() instead of a call to
1455 * PyObject_New() so we are able to allocate space for the object and
1456 * it's data buffer.
1457 */
1458 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001459 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001461 }
1462 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
1464 unicode = (PyCompactUnicodeObject *)obj;
1465 if (is_ascii)
1466 data = ((PyASCIIObject*)obj) + 1;
1467 else
1468 data = unicode + 1;
1469 _PyUnicode_LENGTH(unicode) = size;
1470 _PyUnicode_HASH(unicode) = -1;
1471 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001472 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 _PyUnicode_STATE(unicode).compact = 1;
1474 _PyUnicode_STATE(unicode).ready = 1;
1475 _PyUnicode_STATE(unicode).ascii = is_ascii;
1476 if (is_ascii) {
1477 ((char*)data)[size] = 0;
1478 _PyUnicode_WSTR(unicode) = NULL;
1479 }
Victor Stinner8f825062012-04-27 13:55:39 +02001480 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 ((char*)data)[size] = 0;
1482 _PyUnicode_WSTR(unicode) = NULL;
1483 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001485 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 else {
1488 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001489 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001490 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001492 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 ((Py_UCS4*)data)[size] = 0;
1494 if (is_sharing) {
1495 _PyUnicode_WSTR_LENGTH(unicode) = size;
1496 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1497 }
1498 else {
1499 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500 _PyUnicode_WSTR(unicode) = NULL;
1501 }
1502 }
Victor Stinner8f825062012-04-27 13:55:39 +02001503#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001504 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001505#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001506 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 return obj;
1508}
1509
1510#if SIZEOF_WCHAR_T == 2
1511/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1512 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001513 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514
1515 This function assumes that unicode can hold one more code point than wstr
1516 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001517static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001519 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520{
1521 const wchar_t *iter;
1522 Py_UCS4 *ucs4_out;
1523
Victor Stinner910337b2011-10-03 03:20:16 +02001524 assert(unicode != NULL);
1525 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1527 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1528
1529 for (iter = begin; iter < end; ) {
1530 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1531 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001532 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1533 && (iter+1) < end
1534 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001535 {
Victor Stinner551ac952011-11-29 22:58:13 +01001536 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 iter += 2;
1538 }
1539 else {
1540 *ucs4_out++ = *iter;
1541 iter++;
1542 }
1543 }
1544 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1545 _PyUnicode_GET_LENGTH(unicode)));
1546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547}
1548#endif
1549
Victor Stinnercd9950f2011-10-02 00:34:53 +02001550static int
Victor Stinner488fa492011-12-12 00:01:39 +01001551unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001552{
Victor Stinner488fa492011-12-12 00:01:39 +01001553 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001554 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001555 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001556 return -1;
1557 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001558 return 0;
1559}
1560
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001561static int
1562_copy_characters(PyObject *to, Py_ssize_t to_start,
1563 PyObject *from, Py_ssize_t from_start,
1564 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001566 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001567 const void *from_data;
1568 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569
Victor Stinneree4544c2012-05-09 22:24:08 +02001570 assert(0 <= how_many);
1571 assert(0 <= from_start);
1572 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001573 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001574 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001575 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576
Victor Stinnerd3f08822012-05-29 12:57:52 +02001577 assert(PyUnicode_Check(to));
1578 assert(PyUnicode_IS_READY(to));
1579 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1580
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001581 if (how_many == 0)
1582 return 0;
1583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001585 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001587 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588
Victor Stinnerf1852262012-06-16 16:38:26 +02001589#ifdef Py_DEBUG
1590 if (!check_maxchar
1591 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1592 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001593 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001594 Py_UCS4 ch;
1595 Py_ssize_t i;
1596 for (i=0; i < how_many; i++) {
1597 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1598 assert(ch <= to_maxchar);
1599 }
1600 }
1601#endif
1602
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001603 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001604 if (check_maxchar
1605 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1606 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001607 /* Writing Latin-1 characters into an ASCII string requires to
1608 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001609 Py_UCS4 max_char;
1610 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001611 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001612 if (max_char >= 128)
1613 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001614 }
Christian Heimesf051e432016-09-13 20:22:02 +02001615 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001616 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001617 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001619 else if (from_kind == PyUnicode_1BYTE_KIND
1620 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001621 {
1622 _PyUnicode_CONVERT_BYTES(
1623 Py_UCS1, Py_UCS2,
1624 PyUnicode_1BYTE_DATA(from) + from_start,
1625 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1626 PyUnicode_2BYTE_DATA(to) + to_start
1627 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001628 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001629 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001630 && to_kind == PyUnicode_4BYTE_KIND)
1631 {
1632 _PyUnicode_CONVERT_BYTES(
1633 Py_UCS1, Py_UCS4,
1634 PyUnicode_1BYTE_DATA(from) + from_start,
1635 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1636 PyUnicode_4BYTE_DATA(to) + to_start
1637 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001638 }
1639 else if (from_kind == PyUnicode_2BYTE_KIND
1640 && to_kind == PyUnicode_4BYTE_KIND)
1641 {
1642 _PyUnicode_CONVERT_BYTES(
1643 Py_UCS2, Py_UCS4,
1644 PyUnicode_2BYTE_DATA(from) + from_start,
1645 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1646 PyUnicode_4BYTE_DATA(to) + to_start
1647 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001648 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001649 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001650 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1651
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001652 if (!check_maxchar) {
1653 if (from_kind == PyUnicode_2BYTE_KIND
1654 && to_kind == PyUnicode_1BYTE_KIND)
1655 {
1656 _PyUnicode_CONVERT_BYTES(
1657 Py_UCS2, Py_UCS1,
1658 PyUnicode_2BYTE_DATA(from) + from_start,
1659 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1660 PyUnicode_1BYTE_DATA(to) + to_start
1661 );
1662 }
1663 else if (from_kind == PyUnicode_4BYTE_KIND
1664 && to_kind == PyUnicode_1BYTE_KIND)
1665 {
1666 _PyUnicode_CONVERT_BYTES(
1667 Py_UCS4, Py_UCS1,
1668 PyUnicode_4BYTE_DATA(from) + from_start,
1669 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1670 PyUnicode_1BYTE_DATA(to) + to_start
1671 );
1672 }
1673 else if (from_kind == PyUnicode_4BYTE_KIND
1674 && to_kind == PyUnicode_2BYTE_KIND)
1675 {
1676 _PyUnicode_CONVERT_BYTES(
1677 Py_UCS4, Py_UCS2,
1678 PyUnicode_4BYTE_DATA(from) + from_start,
1679 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1680 PyUnicode_2BYTE_DATA(to) + to_start
1681 );
1682 }
1683 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001684 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001685 }
1686 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001687 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001688 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001689 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001690 Py_ssize_t i;
1691
Victor Stinnera0702ab2011-09-29 14:14:38 +02001692 for (i=0; i < how_many; i++) {
1693 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001694 if (ch > to_maxchar)
1695 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001696 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1697 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001698 }
1699 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001700 return 0;
1701}
1702
Victor Stinnerd3f08822012-05-29 12:57:52 +02001703void
1704_PyUnicode_FastCopyCharacters(
1705 PyObject *to, Py_ssize_t to_start,
1706 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001707{
1708 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1709}
1710
1711Py_ssize_t
1712PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1713 PyObject *from, Py_ssize_t from_start,
1714 Py_ssize_t how_many)
1715{
1716 int err;
1717
1718 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1719 PyErr_BadInternalCall();
1720 return -1;
1721 }
1722
Benjamin Petersonbac79492012-01-14 13:34:47 -05001723 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001724 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001725 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001726 return -1;
1727
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001728 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001729 PyErr_SetString(PyExc_IndexError, "string index out of range");
1730 return -1;
1731 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001732 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001733 PyErr_SetString(PyExc_IndexError, "string index out of range");
1734 return -1;
1735 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001736 if (how_many < 0) {
1737 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1738 return -1;
1739 }
1740 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001741 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1742 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001743 "Cannot write %zi characters at %zi "
1744 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001745 how_many, to_start, PyUnicode_GET_LENGTH(to));
1746 return -1;
1747 }
1748
1749 if (how_many == 0)
1750 return 0;
1751
Victor Stinner488fa492011-12-12 00:01:39 +01001752 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001753 return -1;
1754
1755 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1756 if (err) {
1757 PyErr_Format(PyExc_SystemError,
1758 "Cannot copy %s characters "
1759 "into a string of %s characters",
1760 unicode_kind_name(from),
1761 unicode_kind_name(to));
1762 return -1;
1763 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001764 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765}
1766
Victor Stinner17222162011-09-28 22:15:37 +02001767/* Find the maximum code point and count the number of surrogate pairs so a
1768 correct string length can be computed before converting a string to UCS4.
1769 This function counts single surrogates as a character and not as a pair.
1770
1771 Return 0 on success, or -1 on error. */
1772static int
1773find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1774 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775{
1776 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001777 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778
Victor Stinnerc53be962011-10-02 21:33:54 +02001779 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780 *num_surrogates = 0;
1781 *maxchar = 0;
1782
1783 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001785 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1786 && (iter+1) < end
1787 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1788 {
1789 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1790 ++(*num_surrogates);
1791 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 }
1793 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001795 {
1796 ch = *iter;
1797 iter++;
1798 }
1799 if (ch > *maxchar) {
1800 *maxchar = ch;
1801 if (*maxchar > MAX_UNICODE) {
1802 PyErr_Format(PyExc_ValueError,
1803 "character U+%x is not in range [U+0000; U+10ffff]",
1804 ch);
1805 return -1;
1806 }
1807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808 }
1809 return 0;
1810}
1811
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001812int
1813_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814{
1815 wchar_t *end;
1816 Py_UCS4 maxchar = 0;
1817 Py_ssize_t num_surrogates;
1818#if SIZEOF_WCHAR_T == 2
1819 Py_ssize_t length_wo_surrogates;
1820#endif
1821
Georg Brandl7597add2011-10-05 16:36:47 +02001822 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001823 strings were created using _PyObject_New() and where no canonical
1824 representation (the str field) has been set yet aka strings
1825 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001826 assert(_PyUnicode_CHECK(unicode));
1827 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001830 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001831 /* Actually, it should neither be interned nor be anything else: */
1832 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001835 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001836 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838
1839 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001840 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1841 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 PyErr_NoMemory();
1843 return -1;
1844 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001845 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 _PyUnicode_WSTR(unicode), end,
1847 PyUnicode_1BYTE_DATA(unicode));
1848 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1849 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1850 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1851 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001852 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001853 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001854 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 }
1856 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001857 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001858 _PyUnicode_UTF8(unicode) = NULL;
1859 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 }
1861 PyObject_FREE(_PyUnicode_WSTR(unicode));
1862 _PyUnicode_WSTR(unicode) = NULL;
1863 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1864 }
1865 /* In this case we might have to convert down from 4-byte native
1866 wchar_t to 2-byte unicode. */
1867 else if (maxchar < 65536) {
1868 assert(num_surrogates == 0 &&
1869 "FindMaxCharAndNumSurrogatePairs() messed up");
1870
Victor Stinner506f5922011-09-28 22:34:18 +02001871#if SIZEOF_WCHAR_T == 2
1872 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001873 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001874 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1875 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1876 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001877 _PyUnicode_UTF8(unicode) = NULL;
1878 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001879#else
1880 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001881 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001882 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001883 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001884 PyErr_NoMemory();
1885 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 }
Victor Stinner506f5922011-09-28 22:34:18 +02001887 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1888 _PyUnicode_WSTR(unicode), end,
1889 PyUnicode_2BYTE_DATA(unicode));
1890 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1891 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1892 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001893 _PyUnicode_UTF8(unicode) = NULL;
1894 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001895 PyObject_FREE(_PyUnicode_WSTR(unicode));
1896 _PyUnicode_WSTR(unicode) = NULL;
1897 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1898#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 }
1900 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1901 else {
1902#if SIZEOF_WCHAR_T == 2
1903 /* in case the native representation is 2-bytes, we need to allocate a
1904 new normalized 4-byte version. */
1905 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001906 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1907 PyErr_NoMemory();
1908 return -1;
1909 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001910 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1911 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 PyErr_NoMemory();
1913 return -1;
1914 }
1915 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1916 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001917 _PyUnicode_UTF8(unicode) = NULL;
1918 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001919 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1920 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001921 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 PyObject_FREE(_PyUnicode_WSTR(unicode));
1923 _PyUnicode_WSTR(unicode) = NULL;
1924 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1925#else
1926 assert(num_surrogates == 0);
1927
Victor Stinnerc3c74152011-10-02 20:39:55 +02001928 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001930 _PyUnicode_UTF8(unicode) = NULL;
1931 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1933#endif
1934 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1935 }
1936 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001937 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001938 return 0;
1939}
1940
Alexander Belopolsky40018472011-02-26 01:02:56 +00001941static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001942unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943{
Walter Dörwald16807132007-05-25 13:52:07 +00001944 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 case SSTATE_NOT_INTERNED:
1946 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001947
Benjamin Peterson29060642009-01-31 22:14:21 +00001948 case SSTATE_INTERNED_MORTAL:
1949 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001950 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001951#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001952 if (PyDict_DelItem(interned, unicode) != 0) {
1953 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1954 NULL);
1955 }
Victor Stinner607b1022020-05-05 18:50:30 +02001956#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001957 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001958
Benjamin Peterson29060642009-01-31 22:14:21 +00001959 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001960 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1961 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001962
Benjamin Peterson29060642009-01-31 22:14:21 +00001963 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001965 }
1966
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001967 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001969 }
1970 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001971 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001972 }
1973 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001974 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001977 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978}
1979
#ifdef Py_DEBUG
/* Debug-only helper.  Return 1 if `unicode` is the shared empty string or,
   when LATIN1_SINGLETONS is defined, the cached one-character string stored
   in unicode_latin1[] for its code point; return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == unicode_get_empty()) {
        return 1;
    }
#ifdef LATIN1_SINGLETONS
    PyASCIIObject *header = (PyASCIIObject *)unicode;

    /* Only a ready (non-wchar) string of length 1 can be a cached char. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }

    Py_UCS4 first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode) {
        return 1;
    }
#endif
    return 0;
}
#endif
1999
Alexander Belopolsky40018472011-02-26 01:02:56 +00002000static int
Victor Stinner488fa492011-12-12 00:01:39 +01002001unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002002{
Victor Stinner488fa492011-12-12 00:01:39 +01002003 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002004 if (Py_REFCNT(unicode) != 1)
2005 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002006 if (_PyUnicode_HASH(unicode) != -1)
2007 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002008 if (PyUnicode_CHECK_INTERNED(unicode))
2009 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002010 if (!PyUnicode_CheckExact(unicode))
2011 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002012#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002013 /* singleton refcount is greater than 1 */
2014 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002015#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002016 return 1;
2017}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002018
Victor Stinnerfe226c02011-10-03 03:52:20 +02002019static int
2020unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2021{
2022 PyObject *unicode;
2023 Py_ssize_t old_length;
2024
2025 assert(p_unicode != NULL);
2026 unicode = *p_unicode;
2027
2028 assert(unicode != NULL);
2029 assert(PyUnicode_Check(unicode));
2030 assert(0 <= length);
2031
Victor Stinner910337b2011-10-03 03:20:16 +02002032 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002033 old_length = PyUnicode_WSTR_LENGTH(unicode);
2034 else
2035 old_length = PyUnicode_GET_LENGTH(unicode);
2036 if (old_length == length)
2037 return 0;
2038
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002039 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002040 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002041 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002042 return 0;
2043 }
2044
Victor Stinner488fa492011-12-12 00:01:39 +01002045 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002046 PyObject *copy = resize_copy(unicode, length);
2047 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002048 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002049 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002050 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002051 }
2052
Victor Stinnerfe226c02011-10-03 03:52:20 +02002053 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002054 PyObject *new_unicode = resize_compact(unicode, length);
2055 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002056 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002057 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002058 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002059 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002060 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002061}
2062
Alexander Belopolsky40018472011-02-26 01:02:56 +00002063int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002064PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002065{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002066 PyObject *unicode;
2067 if (p_unicode == NULL) {
2068 PyErr_BadInternalCall();
2069 return -1;
2070 }
2071 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002072 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002073 {
2074 PyErr_BadInternalCall();
2075 return -1;
2076 }
2077 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002078}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002079
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002080/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002081
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002082 WARNING: The function doesn't copy the terminating null character and
2083 doesn't check the maximum character (may write a latin1 character in an
2084 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002085static void
2086unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2087 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002088{
2089 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002090 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002091 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002092
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002093 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002094 switch (kind) {
2095 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002096#ifdef Py_DEBUG
2097 if (PyUnicode_IS_ASCII(unicode)) {
2098 Py_UCS4 maxchar = ucs1lib_find_max_char(
2099 (const Py_UCS1*)str,
2100 (const Py_UCS1*)str + len);
2101 assert(maxchar < 128);
2102 }
2103#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002104 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002105 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002106 }
2107 case PyUnicode_2BYTE_KIND: {
2108 Py_UCS2 *start = (Py_UCS2 *)data + index;
2109 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002110
Victor Stinner184252a2012-06-16 02:57:41 +02002111 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002112 *ucs2 = (Py_UCS2)*str;
2113
2114 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002115 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002116 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002117 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002118 Py_UCS4 *start = (Py_UCS4 *)data + index;
2119 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002120
Victor Stinner184252a2012-06-16 02:57:41 +02002121 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002122 *ucs4 = (Py_UCS4)*str;
2123
2124 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002125 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002127 default:
2128 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002129 }
2130}
2131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002132static PyObject*
2133get_latin1_char(unsigned char ch)
2134{
Victor Stinner607b1022020-05-05 18:50:30 +02002135 PyObject *unicode;
2136
2137#ifdef LATIN1_SINGLETONS
2138 unicode = unicode_latin1[ch];
2139 if (unicode) {
2140 Py_INCREF(unicode);
2141 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142 }
Victor Stinner607b1022020-05-05 18:50:30 +02002143#endif
2144
2145 unicode = PyUnicode_New(1, ch);
2146 if (!unicode) {
2147 return NULL;
2148 }
2149
2150 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2151 assert(_PyUnicode_CheckConsistency(unicode, 1));
2152
2153#ifdef LATIN1_SINGLETONS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154 Py_INCREF(unicode);
Victor Stinner607b1022020-05-05 18:50:30 +02002155 unicode_latin1[ch] = unicode;
2156#endif
Victor Stinnera464fc12011-10-02 20:39:30 +02002157 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158}
2159
Victor Stinner985a82a2014-01-03 12:53:47 +01002160static PyObject*
2161unicode_char(Py_UCS4 ch)
2162{
2163 PyObject *unicode;
2164
2165 assert(ch <= MAX_UNICODE);
2166
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002167 if (ch < 256)
2168 return get_latin1_char(ch);
2169
Victor Stinner985a82a2014-01-03 12:53:47 +01002170 unicode = PyUnicode_New(1, ch);
2171 if (unicode == NULL)
2172 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002173
2174 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2175 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002176 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002177 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002178 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2179 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2180 }
2181 assert(_PyUnicode_CheckConsistency(unicode, 1));
2182 return unicode;
2183}
2184
Alexander Belopolsky40018472011-02-26 01:02:56 +00002185PyObject *
2186PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002187{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002188 if (u == NULL)
2189 return (PyObject*)_PyUnicode_New(size);
2190
2191 if (size < 0) {
2192 PyErr_BadInternalCall();
2193 return NULL;
2194 }
2195
2196 return PyUnicode_FromWideChar(u, size);
2197}
2198
2199PyObject *
2200PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2201{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002202 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 Py_UCS4 maxchar = 0;
2204 Py_ssize_t num_surrogates;
2205
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002206 if (u == NULL && size != 0) {
2207 PyErr_BadInternalCall();
2208 return NULL;
2209 }
2210
2211 if (size == -1) {
2212 size = wcslen(u);
2213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002215 /* If the Unicode data is known at construction time, we can apply
2216 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002219 if (size == 0)
2220 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 /* Single character Unicode objects in the Latin-1 range are
2223 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002224 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 return get_latin1_char((unsigned char)*u);
2226
2227 /* If not empty and not single character, copy the Unicode data
2228 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002229 if (find_maxchar_surrogates(u, u + size,
2230 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 return NULL;
2232
Victor Stinner8faf8212011-12-08 22:14:11 +01002233 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 if (!unicode)
2235 return NULL;
2236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 switch (PyUnicode_KIND(unicode)) {
2238 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002239 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2241 break;
2242 case PyUnicode_2BYTE_KIND:
2243#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002244 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002246 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2248#endif
2249 break;
2250 case PyUnicode_4BYTE_KIND:
2251#if SIZEOF_WCHAR_T == 2
2252 /* This is the only case which has to process surrogates, thus
2253 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002254 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255#else
2256 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002257 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258#endif
2259 break;
2260 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002261 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002264 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265}
2266
Alexander Belopolsky40018472011-02-26 01:02:56 +00002267PyObject *
2268PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002269{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002270 if (size < 0) {
2271 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002272 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 return NULL;
2274 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002275 if (u != NULL)
2276 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2277 else
2278 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002279}
2280
Alexander Belopolsky40018472011-02-26 01:02:56 +00002281PyObject *
2282PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002283{
2284 size_t size = strlen(u);
2285 if (size > PY_SSIZE_T_MAX) {
2286 PyErr_SetString(PyExc_OverflowError, "input too long");
2287 return NULL;
2288 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002289 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002290}
2291
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002292PyObject *
2293_PyUnicode_FromId(_Py_Identifier *id)
2294{
Victor Stinner297257f2020-06-02 14:39:45 +02002295 if (id->object) {
2296 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002297 }
Victor Stinner297257f2020-06-02 14:39:45 +02002298
2299 PyObject *obj;
2300 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2301 strlen(id->string),
2302 NULL, NULL);
2303 if (!obj) {
2304 return NULL;
2305 }
2306 PyUnicode_InternInPlace(&obj);
2307
2308 assert(!id->next);
2309 id->object = obj;
2310 id->next = static_strings;
2311 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002312 return id->object;
2313}
2314
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002315static void
2316unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002317{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002318 _Py_Identifier *tmp, *s = static_strings;
2319 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002320 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002321 tmp = s->next;
2322 s->next = NULL;
2323 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002324 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002325 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002326}
2327
Benjamin Peterson0df54292012-03-26 14:50:32 -04002328/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002329
Victor Stinnerd3f08822012-05-29 12:57:52 +02002330PyObject*
2331_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002332{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002333 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002334 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002335 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002336#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002337 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002338#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002339 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002340 }
Victor Stinner785938e2011-12-11 20:09:03 +01002341 unicode = PyUnicode_New(size, 127);
2342 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002343 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002344 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2345 assert(_PyUnicode_CheckConsistency(unicode, 1));
2346 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002347}
2348
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002349static Py_UCS4
2350kind_maxchar_limit(unsigned int kind)
2351{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002352 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002353 case PyUnicode_1BYTE_KIND:
2354 return 0x80;
2355 case PyUnicode_2BYTE_KIND:
2356 return 0x100;
2357 case PyUnicode_4BYTE_KIND:
2358 return 0x10000;
2359 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002360 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002361 }
2362}
2363
Victor Stinner702c7342011-10-05 13:50:52 +02002364static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002365_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002368 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002369
Serhiy Storchaka678db842013-01-26 12:16:36 +02002370 if (size == 0)
2371 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002372 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002373 if (size == 1)
2374 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002375
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002376 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002377 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 if (!res)
2379 return NULL;
2380 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002381 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002383}
2384
Victor Stinnere57b1c02011-09-28 22:20:48 +02002385static PyObject*
2386_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387{
2388 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002389 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002390
Serhiy Storchaka678db842013-01-26 12:16:36 +02002391 if (size == 0)
2392 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002393 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002394 if (size == 1)
2395 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002396
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002397 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002398 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 if (!res)
2400 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002401 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002403 else {
2404 _PyUnicode_CONVERT_BYTES(
2405 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2406 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002407 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 return res;
2409}
2410
Victor Stinnere57b1c02011-09-28 22:20:48 +02002411static PyObject*
2412_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413{
2414 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002415 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002416
Serhiy Storchaka678db842013-01-26 12:16:36 +02002417 if (size == 0)
2418 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002419 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002420 if (size == 1)
2421 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002422
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002423 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002424 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 if (!res)
2426 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002427 if (max_char < 256)
2428 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2429 PyUnicode_1BYTE_DATA(res));
2430 else if (max_char < 0x10000)
2431 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2432 PyUnicode_2BYTE_DATA(res));
2433 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002435 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 return res;
2437}
2438
2439PyObject*
2440PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2441{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002442 if (size < 0) {
2443 PyErr_SetString(PyExc_ValueError, "size must be positive");
2444 return NULL;
2445 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002446 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002448 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002450 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002452 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002453 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002454 PyErr_SetString(PyExc_SystemError, "invalid kind");
2455 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457}
2458
Victor Stinnerece58de2012-04-23 23:36:38 +02002459Py_UCS4
2460_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2461{
2462 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002463 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002464
2465 assert(PyUnicode_IS_READY(unicode));
2466 assert(0 <= start);
2467 assert(end <= PyUnicode_GET_LENGTH(unicode));
2468 assert(start <= end);
2469
2470 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2471 return PyUnicode_MAX_CHAR_VALUE(unicode);
2472
2473 if (start == end)
2474 return 127;
2475
Victor Stinner94d558b2012-04-27 22:26:58 +02002476 if (PyUnicode_IS_ASCII(unicode))
2477 return 127;
2478
Victor Stinnerece58de2012-04-23 23:36:38 +02002479 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002480 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002481 endptr = (char *)startptr + end * kind;
2482 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002483 switch(kind) {
2484 case PyUnicode_1BYTE_KIND:
2485 return ucs1lib_find_max_char(startptr, endptr);
2486 case PyUnicode_2BYTE_KIND:
2487 return ucs2lib_find_max_char(startptr, endptr);
2488 case PyUnicode_4BYTE_KIND:
2489 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002490 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002491 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002492 }
2493}
2494
Victor Stinner25a4b292011-10-06 12:31:55 +02002495/* Ensure that a string uses the most efficient storage, if it is not the
2496 case: create a new string with of the right kind. Write NULL into *p_unicode
2497 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002498static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002499unicode_adjust_maxchar(PyObject **p_unicode)
2500{
2501 PyObject *unicode, *copy;
2502 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002503 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002504 unsigned int kind;
2505
2506 assert(p_unicode != NULL);
2507 unicode = *p_unicode;
2508 assert(PyUnicode_IS_READY(unicode));
2509 if (PyUnicode_IS_ASCII(unicode))
2510 return;
2511
2512 len = PyUnicode_GET_LENGTH(unicode);
2513 kind = PyUnicode_KIND(unicode);
2514 if (kind == PyUnicode_1BYTE_KIND) {
2515 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002516 max_char = ucs1lib_find_max_char(u, u + len);
2517 if (max_char >= 128)
2518 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002519 }
2520 else if (kind == PyUnicode_2BYTE_KIND) {
2521 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002522 max_char = ucs2lib_find_max_char(u, u + len);
2523 if (max_char >= 256)
2524 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002525 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002526 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002527 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002528 max_char = ucs4lib_find_max_char(u, u + len);
2529 if (max_char >= 0x10000)
2530 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002531 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002532 else
2533 Py_UNREACHABLE();
2534
Victor Stinner25a4b292011-10-06 12:31:55 +02002535 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002536 if (copy != NULL)
2537 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002538 Py_DECREF(unicode);
2539 *p_unicode = copy;
2540}
2541
Victor Stinner034f6cf2011-09-30 02:26:44 +02002542PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002543_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002544{
Victor Stinner87af4f22011-11-21 23:03:47 +01002545 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002546 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002547
Victor Stinner034f6cf2011-09-30 02:26:44 +02002548 if (!PyUnicode_Check(unicode)) {
2549 PyErr_BadInternalCall();
2550 return NULL;
2551 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002552 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002553 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002554
Victor Stinner87af4f22011-11-21 23:03:47 +01002555 length = PyUnicode_GET_LENGTH(unicode);
2556 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002557 if (!copy)
2558 return NULL;
2559 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2560
Christian Heimesf051e432016-09-13 20:22:02 +02002561 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002562 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002563 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002564 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002565}
2566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567
Victor Stinnerbc603d12011-10-02 01:00:40 +02002568/* Widen Unicode objects to larger buffers. Don't write terminating null
2569 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002571static void*
2572unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002574 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002575
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002576 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002577 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002578 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002579 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002580 if (!result)
2581 return PyErr_NoMemory();
2582 assert(skind == PyUnicode_1BYTE_KIND);
2583 _PyUnicode_CONVERT_BYTES(
2584 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002585 (const Py_UCS1 *)data,
2586 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002587 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002589 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002590 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002591 if (!result)
2592 return PyErr_NoMemory();
2593 if (skind == PyUnicode_2BYTE_KIND) {
2594 _PyUnicode_CONVERT_BYTES(
2595 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002596 (const Py_UCS2 *)data,
2597 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002598 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002599 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002600 else {
2601 assert(skind == PyUnicode_1BYTE_KIND);
2602 _PyUnicode_CONVERT_BYTES(
2603 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002604 (const Py_UCS1 *)data,
2605 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002606 result);
2607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002609 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002610 Py_UNREACHABLE();
2611 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613}
2614
2615static Py_UCS4*
2616as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2617 int copy_null)
2618{
2619 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002620 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 Py_ssize_t len, targetlen;
2622 if (PyUnicode_READY(string) == -1)
2623 return NULL;
2624 kind = PyUnicode_KIND(string);
2625 data = PyUnicode_DATA(string);
2626 len = PyUnicode_GET_LENGTH(string);
2627 targetlen = len;
2628 if (copy_null)
2629 targetlen++;
2630 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002631 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 if (!target) {
2633 PyErr_NoMemory();
2634 return NULL;
2635 }
2636 }
2637 else {
2638 if (targetsize < targetlen) {
2639 PyErr_Format(PyExc_SystemError,
2640 "string is longer than the buffer");
2641 if (copy_null && 0 < targetsize)
2642 target[0] = 0;
2643 return NULL;
2644 }
2645 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002646 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002647 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002648 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002650 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002651 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002652 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2653 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002654 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002655 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002656 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002657 else {
2658 Py_UNREACHABLE();
2659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002660 if (copy_null)
2661 target[len] = 0;
2662 return target;
2663}
2664
2665Py_UCS4*
2666PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2667 int copy_null)
2668{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002669 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 PyErr_BadInternalCall();
2671 return NULL;
2672 }
2673 return as_ucs4(string, target, targetsize, copy_null);
2674}
2675
2676Py_UCS4*
2677PyUnicode_AsUCS4Copy(PyObject *string)
2678{
2679 return as_ucs4(string, NULL, 0, 1);
2680}
2681
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for the
   sign.  53/22 is an upper bound for log10(256), and
   1 + (SIZEOF_LONG_LONG*53-1) / 22 is that ceiling written with integer
   division; the other 1 in the leading constant is for the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002686
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002687static int
2688unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2689 Py_ssize_t width, Py_ssize_t precision)
2690{
2691 Py_ssize_t length, fill, arglen;
2692 Py_UCS4 maxchar;
2693
2694 if (PyUnicode_READY(str) == -1)
2695 return -1;
2696
2697 length = PyUnicode_GET_LENGTH(str);
2698 if ((precision == -1 || precision >= length)
2699 && width <= length)
2700 return _PyUnicodeWriter_WriteStr(writer, str);
2701
2702 if (precision != -1)
2703 length = Py_MIN(precision, length);
2704
2705 arglen = Py_MAX(length, width);
2706 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2707 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2708 else
2709 maxchar = writer->maxchar;
2710
2711 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2712 return -1;
2713
2714 if (width > length) {
2715 fill = width - length;
2716 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2717 return -1;
2718 writer->pos += fill;
2719 }
2720
2721 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2722 str, 0, length);
2723 writer->pos += length;
2724 return 0;
2725}
2726
2727static int
Victor Stinner998b8062018-09-12 00:23:25 +02002728unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002729 Py_ssize_t width, Py_ssize_t precision)
2730{
2731 /* UTF-8 */
2732 Py_ssize_t length;
2733 PyObject *unicode;
2734 int res;
2735
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002736 if (precision == -1) {
2737 length = strlen(str);
2738 }
2739 else {
2740 length = 0;
2741 while (length < precision && str[length]) {
2742 length++;
2743 }
2744 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002745 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2746 if (unicode == NULL)
2747 return -1;
2748
2749 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2750 Py_DECREF(unicode);
2751 return res;
2752}
2753
/* Format a single '%' directive of a PyUnicode_FromFormatV() format string.

   f points at the '%' that starts the directive.  The arguments the
   directive needs are fetched from *vargs and the formatted text is
   appended to writer.

   Return a pointer to the first character after the consumed directive, or
   NULL with an exception set on error.  For an unrecognised conversion the
   rest of the format string, starting at the '%', is copied to the output
   unchanged and the returned pointer is the end of the format string. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember where the directive starts: the default case below copies
       the raw text from here. */
    p = f;
    f++;
    /* A leading '0' selects zero padding for the integer conversions. */
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Reject the digit before width*10 + digit can overflow. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    /* -1 means "no precision given". */
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu.  A modifier is only consumed when it
       is followed by 'd', 'u' or 'i'; otherwise f stays on the modifier
       character and the switch below ends up in the default case. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The conversion character is the last character of the format string:
       turn off overallocation in the writer. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* A single code point, passed as int. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Let sprintf() produce the bare digits; the argument type read
           from the va_list follows the length modifier parsed above. */
        if (*f == 'u') {
            if (longflag) {
                len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
            }
            else {
                len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
            }
        }
        else if (*f == 'x') {
            /* No length modifier is applied here: the argument is always
               read as int. */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag) {
                len = sprintf(buffer, "%li", va_arg(*vargs, long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
            }
            else {
                len = sprintf(buffer, "%i", va_arg(*vargs, int));
            }
        }
        assert(len >= 0);

        /* From here on precision is the number of characters written
           after the width padding: at least the sprintf() output. */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        /* 127: everything written below is ASCII. */
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Width padding comes first, using '0' or ' ' depending on the
           zero-pad flag... */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* ...then '0' characters up to the requested precision... */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        /* ...and finally the sprintf() output itself. */
        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* Shift the text (including its NUL) right by two and
               prepend "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A str object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object (may be NULL)
           and a C string that is used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Result of PyObject_Str(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Result of PyObject_Repr(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Result of PyObject_ASCII(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* Literal percent sign; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
3051
Walter Dörwaldd2034312007-05-18 16:29:38 +00003052PyObject *
3053PyUnicode_FromFormatV(const char *format, va_list vargs)
3054{
Victor Stinnere215d962012-10-06 23:03:36 +02003055 va_list vargs2;
3056 const char *f;
3057 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003058
Victor Stinner8f674cc2013-04-17 23:02:17 +02003059 _PyUnicodeWriter_Init(&writer);
3060 writer.min_length = strlen(format) + 100;
3061 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003062
Benjamin Peterson0c212142016-09-20 20:39:33 -07003063 // Copy varags to be able to pass a reference to a subfunction.
3064 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003065
3066 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003067 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003068 f = unicode_fromformat_arg(&writer, f, &vargs2);
3069 if (f == NULL)
3070 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003072 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003073 const char *p;
3074 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003075
Victor Stinnere215d962012-10-06 23:03:36 +02003076 p = f;
3077 do
3078 {
3079 if ((unsigned char)*p > 127) {
3080 PyErr_Format(PyExc_ValueError,
3081 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3082 "string, got a non-ASCII byte: 0x%02x",
3083 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003084 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003085 }
3086 p++;
3087 }
3088 while (*p != '\0' && *p != '%');
3089 len = p - f;
3090
3091 if (*p == '\0')
3092 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003093
3094 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003095 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003096
3097 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003098 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003099 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003100 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003101 return _PyUnicodeWriter_Finish(&writer);
3102
3103 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003104 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003105 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003106 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003107}
3108
Walter Dörwaldd2034312007-05-18 16:29:38 +00003109PyObject *
3110PyUnicode_FromFormat(const char *format, ...)
3111{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003112 PyObject* ret;
3113 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003114
3115#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003116 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003117#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003118 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003119#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003120 ret = PyUnicode_FromFormatV(format, vargs);
3121 va_end(vargs);
3122 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003123}
3124
Serhiy Storchakac46db922018-10-23 22:58:24 +03003125static Py_ssize_t
3126unicode_get_widechar_size(PyObject *unicode)
3127{
3128 Py_ssize_t res;
3129
3130 assert(unicode != NULL);
3131 assert(_PyUnicode_CHECK(unicode));
3132
3133 if (_PyUnicode_WSTR(unicode) != NULL) {
3134 return PyUnicode_WSTR_LENGTH(unicode);
3135 }
3136 assert(PyUnicode_IS_READY(unicode));
3137
3138 res = _PyUnicode_LENGTH(unicode);
3139#if SIZEOF_WCHAR_T == 2
3140 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3141 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3142 const Py_UCS4 *end = s + res;
3143 for (; s < end; ++s) {
3144 if (*s > 0xFFFF) {
3145 ++res;
3146 }
3147 }
3148 }
3149#endif
3150 return res;
3151}
3152
3153static void
3154unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3155{
3156 const wchar_t *wstr;
3157
3158 assert(unicode != NULL);
3159 assert(_PyUnicode_CHECK(unicode));
3160
3161 wstr = _PyUnicode_WSTR(unicode);
3162 if (wstr != NULL) {
3163 memcpy(w, wstr, size * sizeof(wchar_t));
3164 return;
3165 }
3166 assert(PyUnicode_IS_READY(unicode));
3167
3168 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3169 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3170 for (; size--; ++s, ++w) {
3171 *w = *s;
3172 }
3173 }
3174 else {
3175#if SIZEOF_WCHAR_T == 4
3176 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3177 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3178 for (; size--; ++s, ++w) {
3179 *w = *s;
3180 }
3181#else
3182 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3183 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3184 for (; size--; ++s, ++w) {
3185 Py_UCS4 ch = *s;
3186 if (ch > 0xFFFF) {
3187 assert(ch <= MAX_UNICODE);
3188 /* encode surrogate pair in this case */
3189 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3190 if (!size--)
3191 break;
3192 *w = Py_UNICODE_LOW_SURROGATE(ch);
3193 }
3194 else {
3195 *w = ch;
3196 }
3197 }
3198#endif
3199 }
3200}
3201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003202#ifdef HAVE_WCHAR_H
3203
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003204/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003205
Victor Stinnerd88d9832011-09-06 02:00:05 +02003206 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003207 character) required to convert the unicode object. Ignore size argument.
3208
Victor Stinnerd88d9832011-09-06 02:00:05 +02003209 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003210 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003211 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003212Py_ssize_t
3213PyUnicode_AsWideChar(PyObject *unicode,
3214 wchar_t *w,
3215 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003216{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003217 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003218
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003219 if (unicode == NULL) {
3220 PyErr_BadInternalCall();
3221 return -1;
3222 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003223 if (!PyUnicode_Check(unicode)) {
3224 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003225 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003226 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003227
3228 res = unicode_get_widechar_size(unicode);
3229 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003230 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003231 }
3232
3233 if (size > res) {
3234 size = res + 1;
3235 }
3236 else {
3237 res = size;
3238 }
3239 unicode_copy_as_widechar(unicode, w, size);
3240 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003241}
3242
Victor Stinner137c34c2010-09-29 10:25:54 +00003243wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003244PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003245 Py_ssize_t *size)
3246{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003247 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003248 Py_ssize_t buflen;
3249
3250 if (unicode == NULL) {
3251 PyErr_BadInternalCall();
3252 return NULL;
3253 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003254 if (!PyUnicode_Check(unicode)) {
3255 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003256 return NULL;
3257 }
3258
Serhiy Storchakac46db922018-10-23 22:58:24 +03003259 buflen = unicode_get_widechar_size(unicode);
3260 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003261 if (buffer == NULL) {
3262 PyErr_NoMemory();
3263 return NULL;
3264 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003265 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3266 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003267 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003268 }
3269 else if (wcslen(buffer) != (size_t)buflen) {
3270 PyMem_FREE(buffer);
3271 PyErr_SetString(PyExc_ValueError,
3272 "embedded null character");
3273 return NULL;
3274 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003275 return buffer;
3276}
3277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003278#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003279
Alexander Belopolsky40018472011-02-26 01:02:56 +00003280PyObject *
3281PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003282{
Victor Stinner8faf8212011-12-08 22:14:11 +01003283 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003284 PyErr_SetString(PyExc_ValueError,
3285 "chr() arg not in range(0x110000)");
3286 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003287 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003288
Victor Stinner985a82a2014-01-03 12:53:47 +01003289 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003290}
3291
Alexander Belopolsky40018472011-02-26 01:02:56 +00003292PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003293PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003295 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003296 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003297 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003298 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003299 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003300 Py_INCREF(obj);
3301 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003302 }
3303 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003304 /* For a Unicode subtype that's not a Unicode object,
3305 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003306 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003307 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003308 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003309 "Can't convert '%.100s' object to str implicitly",
3310 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003311 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003312}
3313
Alexander Belopolsky40018472011-02-26 01:02:56 +00003314PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003315PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003316 const char *encoding,
3317 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003318{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003319 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003320 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003321
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003323 PyErr_BadInternalCall();
3324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003326
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003327 /* Decoding bytes objects is the most common case and should be fast */
3328 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003329 if (PyBytes_GET_SIZE(obj) == 0) {
3330 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3331 return NULL;
3332 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003333 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003334 }
3335 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003336 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3337 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003338 }
3339
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003340 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003341 PyErr_SetString(PyExc_TypeError,
3342 "decoding str is not supported");
3343 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003344 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003345
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003346 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3347 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3348 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003349 "decoding to str: need a bytes-like object, %.80s found",
3350 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003351 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003352 }
Tim Petersced69f82003-09-16 20:30:58 +00003353
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003354 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003355 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003356 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3357 return NULL;
3358 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003359 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003361
Serhiy Storchaka05997252013-01-26 12:14:02 +02003362 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003363 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003364 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365}
3366
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is collapsed into a single '_', except that leading and
   trailing runs are dropped.  The result written to `lower` is always
   null-terminated on success.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* No room even for the terminator; lower_len - 1 below would wrap
           around and disable every bounds check. */
        return 0;
    }

    l = lower;
    /* Last slot of the buffer, reserved for the terminating null. */
    l_end = &lower[lower_len - 1];
    /* Set while a run of separator characters is pending. */
    punct = 0;
    for (e = encoding; *e != '\0'; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        /* Emit the pending separator, but never at the very start. */
        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3415
Alexander Belopolsky40018472011-02-26 01:02:56 +00003416PyObject *
3417PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003418 Py_ssize_t size,
3419 const char *encoding,
3420 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003421{
3422 PyObject *buffer = NULL, *unicode;
3423 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003424 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3425
Victor Stinner22eb6892019-06-26 00:51:05 +02003426 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3427 return NULL;
3428 }
3429
Victor Stinnered076ed2019-06-26 01:49:32 +02003430 if (size == 0) {
3431 _Py_RETURN_UNICODE_EMPTY();
3432 }
3433
Victor Stinner942889a2016-09-05 15:40:10 -07003434 if (encoding == NULL) {
3435 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3436 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003437
Fred Drakee4315f52000-05-09 19:53:39 +00003438 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003439 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3440 char *lower = buflower;
3441
3442 /* Fast paths */
3443 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3444 lower += 3;
3445 if (*lower == '_') {
3446 /* Match "utf8" and "utf_8" */
3447 lower++;
3448 }
3449
3450 if (lower[0] == '8' && lower[1] == 0) {
3451 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3452 }
3453 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3454 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3455 }
3456 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3457 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3458 }
3459 }
3460 else {
3461 if (strcmp(lower, "ascii") == 0
3462 || strcmp(lower, "us_ascii") == 0) {
3463 return PyUnicode_DecodeASCII(s, size, errors);
3464 }
Steve Dowercc16be82016-09-08 10:35:16 -07003465 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003466 else if (strcmp(lower, "mbcs") == 0) {
3467 return PyUnicode_DecodeMBCS(s, size, errors);
3468 }
3469 #endif
3470 else if (strcmp(lower, "latin1") == 0
3471 || strcmp(lower, "latin_1") == 0
3472 || strcmp(lower, "iso_8859_1") == 0
3473 || strcmp(lower, "iso8859_1") == 0) {
3474 return PyUnicode_DecodeLatin1(s, size, errors);
3475 }
3476 }
Victor Stinner37296e82010-06-10 13:36:23 +00003477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478
3479 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003480 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003481 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003482 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003483 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 if (buffer == NULL)
3485 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003486 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 if (unicode == NULL)
3488 goto onError;
3489 if (!PyUnicode_Check(unicode)) {
3490 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003491 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003492 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003493 encoding,
3494 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 Py_DECREF(unicode);
3496 goto onError;
3497 }
3498 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003499 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003500
Benjamin Peterson29060642009-01-31 22:14:21 +00003501 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 Py_XDECREF(buffer);
3503 return NULL;
3504}
3505
Alexander Belopolsky40018472011-02-26 01:02:56 +00003506PyObject *
3507PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003508 const char *encoding,
3509 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003510{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003511 if (!PyUnicode_Check(unicode)) {
3512 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003513 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003514 }
3515
Serhiy Storchaka00939072016-10-27 21:05:49 +03003516 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3517 "PyUnicode_AsDecodedObject() is deprecated; "
3518 "use PyCodec_Decode() to decode from str", 1) < 0)
3519 return NULL;
3520
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003521 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003522 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003523
3524 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003525 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003526}
3527
Alexander Belopolsky40018472011-02-26 01:02:56 +00003528PyObject *
3529PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003530 const char *encoding,
3531 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003532{
3533 PyObject *v;
3534
3535 if (!PyUnicode_Check(unicode)) {
3536 PyErr_BadArgument();
3537 goto onError;
3538 }
3539
Serhiy Storchaka00939072016-10-27 21:05:49 +03003540 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3541 "PyUnicode_AsDecodedUnicode() is deprecated; "
3542 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3543 return NULL;
3544
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003545 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003546 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003547
3548 /* Decode via the codec registry */
3549 v = PyCodec_Decode(unicode, encoding, errors);
3550 if (v == NULL)
3551 goto onError;
3552 if (!PyUnicode_Check(v)) {
3553 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003554 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003555 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003556 encoding,
3557 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003558 Py_DECREF(v);
3559 goto onError;
3560 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003561 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003562
Benjamin Peterson29060642009-01-31 22:14:21 +00003563 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003564 return NULL;
3565}
3566
Alexander Belopolsky40018472011-02-26 01:02:56 +00003567PyObject *
3568PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003569 Py_ssize_t size,
3570 const char *encoding,
3571 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572{
3573 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003574
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003575 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3579 Py_DECREF(unicode);
3580 return v;
3581}
3582
Alexander Belopolsky40018472011-02-26 01:02:56 +00003583PyObject *
3584PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003585 const char *encoding,
3586 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003587{
3588 PyObject *v;
3589
3590 if (!PyUnicode_Check(unicode)) {
3591 PyErr_BadArgument();
3592 goto onError;
3593 }
3594
Serhiy Storchaka00939072016-10-27 21:05:49 +03003595 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3596 "PyUnicode_AsEncodedObject() is deprecated; "
3597 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3598 "or PyCodec_Encode() for generic encoding", 1) < 0)
3599 return NULL;
3600
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003601 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003603
3604 /* Encode via the codec registry */
3605 v = PyCodec_Encode(unicode, encoding, errors);
3606 if (v == NULL)
3607 goto onError;
3608 return v;
3609
Benjamin Peterson29060642009-01-31 22:14:21 +00003610 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003611 return NULL;
3612}
3613
Victor Stinner1b579672011-12-17 05:47:23 +01003614
Victor Stinner2cba6b82018-01-10 22:46:15 +01003615static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003616unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003617 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003618{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003619 Py_ssize_t wlen;
3620 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3621 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003622 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003623 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003624
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003625 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003626 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003627 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003628 return NULL;
3629 }
3630
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003631 char *str;
3632 size_t error_pos;
3633 const char *reason;
3634 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003635 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003636 PyMem_Free(wstr);
3637
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003638 if (res != 0) {
3639 if (res == -2) {
3640 PyObject *exc;
3641 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3642 "locale", unicode,
3643 (Py_ssize_t)error_pos,
3644 (Py_ssize_t)(error_pos+1),
3645 reason);
3646 if (exc != NULL) {
3647 PyCodec_StrictErrors(exc);
3648 Py_DECREF(exc);
3649 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003650 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003651 else if (res == -3) {
3652 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3653 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003654 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003655 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003656 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003657 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003658 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003659
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003660 PyObject *bytes = PyBytes_FromString(str);
3661 PyMem_RawFree(str);
3662 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003663}
3664
Victor Stinnerad158722010-10-27 00:25:46 +00003665PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003666PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3667{
Victor Stinner709d23d2019-05-02 14:56:30 -04003668 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3669 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003670}
3671
3672PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003673PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003674{
Victor Stinner81a7be32020-04-14 15:14:01 +02003675 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003676 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3677 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003678 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003679 fs_codec->error_handler,
3680 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003681 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003682#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003683 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003684 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003685 fs_codec->encoding,
3686 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003687 }
Victor Stinnerad158722010-10-27 00:25:46 +00003688#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003689 else {
3690 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3691 machinery is not ready and so cannot be used:
3692 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003693 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3694 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003695 assert(filesystem_errors != NULL);
3696 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3697 assert(errors != _Py_ERROR_UNKNOWN);
3698#ifdef _Py_FORCE_UTF8_FS_ENCODING
3699 return unicode_encode_utf8(unicode, errors, NULL);
3700#else
3701 return unicode_encode_locale(unicode, errors, 0);
3702#endif
3703 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003704}
3705
Alexander Belopolsky40018472011-02-26 01:02:56 +00003706PyObject *
3707PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003708 const char *encoding,
3709 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710{
3711 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003712 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003713
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 if (!PyUnicode_Check(unicode)) {
3715 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003716 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 }
Fred Drakee4315f52000-05-09 19:53:39 +00003718
Victor Stinner22eb6892019-06-26 00:51:05 +02003719 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3720 return NULL;
3721 }
3722
Victor Stinner942889a2016-09-05 15:40:10 -07003723 if (encoding == NULL) {
3724 return _PyUnicode_AsUTF8String(unicode, errors);
3725 }
3726
Fred Drakee4315f52000-05-09 19:53:39 +00003727 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003728 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3729 char *lower = buflower;
3730
3731 /* Fast paths */
3732 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3733 lower += 3;
3734 if (*lower == '_') {
3735 /* Match "utf8" and "utf_8" */
3736 lower++;
3737 }
3738
3739 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003741 }
3742 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3743 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3744 }
3745 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3746 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3747 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003748 }
Victor Stinner942889a2016-09-05 15:40:10 -07003749 else {
3750 if (strcmp(lower, "ascii") == 0
3751 || strcmp(lower, "us_ascii") == 0) {
3752 return _PyUnicode_AsASCIIString(unicode, errors);
3753 }
Steve Dowercc16be82016-09-08 10:35:16 -07003754#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003755 else if (strcmp(lower, "mbcs") == 0) {
3756 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3757 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003758#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003759 else if (strcmp(lower, "latin1") == 0 ||
3760 strcmp(lower, "latin_1") == 0 ||
3761 strcmp(lower, "iso_8859_1") == 0 ||
3762 strcmp(lower, "iso8859_1") == 0) {
3763 return _PyUnicode_AsLatin1String(unicode, errors);
3764 }
3765 }
Victor Stinner37296e82010-06-10 13:36:23 +00003766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767
3768 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003769 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003771 return NULL;
3772
3773 /* The normal path */
3774 if (PyBytes_Check(v))
3775 return v;
3776
3777 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003778 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003779 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003780 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003781
3782 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003783 "encoder %s returned bytearray instead of bytes; "
3784 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003785 encoding);
3786 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003787 Py_DECREF(v);
3788 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003789 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003790
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003791 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3792 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003793 Py_DECREF(v);
3794 return b;
3795 }
3796
3797 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003798 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003799 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003800 encoding,
3801 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003802 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003803 return NULL;
3804}
3805
Alexander Belopolsky40018472011-02-26 01:02:56 +00003806PyObject *
3807PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003808 const char *encoding,
3809 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003810{
3811 PyObject *v;
3812
3813 if (!PyUnicode_Check(unicode)) {
3814 PyErr_BadArgument();
3815 goto onError;
3816 }
3817
Serhiy Storchaka00939072016-10-27 21:05:49 +03003818 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3819 "PyUnicode_AsEncodedUnicode() is deprecated; "
3820 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3821 return NULL;
3822
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003823 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003824 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003825
3826 /* Encode via the codec registry */
3827 v = PyCodec_Encode(unicode, encoding, errors);
3828 if (v == NULL)
3829 goto onError;
3830 if (!PyUnicode_Check(v)) {
3831 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003832 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003833 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003834 encoding,
3835 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003836 Py_DECREF(v);
3837 goto onError;
3838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003840
Benjamin Peterson29060642009-01-31 22:14:21 +00003841 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842 return NULL;
3843}
3844
Victor Stinner2cba6b82018-01-10 22:46:15 +01003845static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003846unicode_decode_locale(const char *str, Py_ssize_t len,
3847 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003848{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003849 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3850 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003851 return NULL;
3852 }
3853
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003854 wchar_t *wstr;
3855 size_t wlen;
3856 const char *reason;
3857 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003858 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003859 if (res != 0) {
3860 if (res == -2) {
3861 PyObject *exc;
3862 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3863 "locale", str, len,
3864 (Py_ssize_t)wlen,
3865 (Py_ssize_t)(wlen + 1),
3866 reason);
3867 if (exc != NULL) {
3868 PyCodec_StrictErrors(exc);
3869 Py_DECREF(exc);
3870 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003871 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003872 else if (res == -3) {
3873 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3874 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003875 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003876 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003877 }
Victor Stinner2f197072011-12-17 07:08:30 +01003878 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003879 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003880
3881 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3882 PyMem_RawFree(wstr);
3883 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003884}
3885
3886PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003887PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3888 const char *errors)
3889{
Victor Stinner709d23d2019-05-02 14:56:30 -04003890 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3891 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003892}
3893
3894PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003895PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003896{
3897 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003898 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3899 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003900}
3901
3902
3903PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003904PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003905 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003906 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3907}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003908
Christian Heimes5894ba72007-11-04 11:43:14 +00003909PyObject*
3910PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3911{
Victor Stinner81a7be32020-04-14 15:14:01 +02003912 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003913 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3914 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003915 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003916 fs_codec->error_handler,
3917 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003918 NULL);
3919 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003920#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003921 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003922 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003923 fs_codec->encoding,
3924 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003925 }
Victor Stinnerad158722010-10-27 00:25:46 +00003926#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003927 else {
3928 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3929 machinery is not ready and so cannot be used:
3930 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003931 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3932 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003933 assert(filesystem_errors != NULL);
3934 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3935 assert(errors != _Py_ERROR_UNKNOWN);
3936#ifdef _Py_FORCE_UTF8_FS_ENCODING
3937 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3938#else
3939 return unicode_decode_locale(s, size, errors, 0);
3940#endif
3941 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003942}
3943
Martin v. Löwis011e8422009-05-05 04:43:17 +00003944
3945int
3946PyUnicode_FSConverter(PyObject* arg, void* addr)
3947{
Brett Cannonec6ce872016-09-06 15:50:29 -07003948 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003949 PyObject *output = NULL;
3950 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003951 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003952 if (arg == NULL) {
3953 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003954 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003955 return 1;
3956 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003957 path = PyOS_FSPath(arg);
3958 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003959 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003960 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003961 if (PyBytes_Check(path)) {
3962 output = path;
3963 }
3964 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3965 output = PyUnicode_EncodeFSDefault(path);
3966 Py_DECREF(path);
3967 if (!output) {
3968 return 0;
3969 }
3970 assert(PyBytes_Check(output));
3971 }
3972
Victor Stinner0ea2a462010-04-30 00:22:08 +00003973 size = PyBytes_GET_SIZE(output);
3974 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003975 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003976 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003977 Py_DECREF(output);
3978 return 0;
3979 }
3980 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003981 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003982}
3983
3984
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003985int
3986PyUnicode_FSDecoder(PyObject* arg, void* addr)
3987{
Brett Cannona5711202016-09-06 19:36:01 -07003988 int is_buffer = 0;
3989 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003990 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003991 if (arg == NULL) {
3992 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003993 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003994 return 1;
3995 }
Brett Cannona5711202016-09-06 19:36:01 -07003996
3997 is_buffer = PyObject_CheckBuffer(arg);
3998 if (!is_buffer) {
3999 path = PyOS_FSPath(arg);
4000 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004001 return 0;
4002 }
Brett Cannona5711202016-09-06 19:36:01 -07004003 }
4004 else {
4005 path = arg;
4006 Py_INCREF(arg);
4007 }
4008
4009 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004010 output = path;
4011 }
4012 else if (PyBytes_Check(path) || is_buffer) {
4013 PyObject *path_bytes = NULL;
4014
4015 if (!PyBytes_Check(path) &&
4016 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004017 "path should be string, bytes, or os.PathLike, not %.200s",
4018 Py_TYPE(arg)->tp_name)) {
4019 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004020 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004021 }
4022 path_bytes = PyBytes_FromObject(path);
4023 Py_DECREF(path);
4024 if (!path_bytes) {
4025 return 0;
4026 }
4027 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4028 PyBytes_GET_SIZE(path_bytes));
4029 Py_DECREF(path_bytes);
4030 if (!output) {
4031 return 0;
4032 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004033 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004034 else {
4035 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004036 "path should be string, bytes, or os.PathLike, not %.200s",
4037 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004038 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004039 return 0;
4040 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004041 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004042 Py_DECREF(output);
4043 return 0;
4044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004046 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004047 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004048 Py_DECREF(output);
4049 return 0;
4050 }
4051 *(PyObject**)addr = output;
4052 return Py_CLEANUP_SUPPORTED;
4053}
4054
4055
Inada Naoki02a4d572020-02-27 13:48:59 +09004056static int unicode_fill_utf8(PyObject *unicode);
4057
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004058const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004060{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004061 if (!PyUnicode_Check(unicode)) {
4062 PyErr_BadArgument();
4063 return NULL;
4064 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004065 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004066 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004068 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004069 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070 return NULL;
4071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 }
4073
4074 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004075 *psize = PyUnicode_UTF8_LENGTH(unicode);
4076 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004077}
4078
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004079const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004081{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4083}
4084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004085Py_UNICODE *
4086PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004088 if (!PyUnicode_Check(unicode)) {
4089 PyErr_BadArgument();
4090 return NULL;
4091 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004092 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4093 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004095 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004096 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097
Serhiy Storchakac46db922018-10-23 22:58:24 +03004098 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4099 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4100 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004101 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004103 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4104 if (w == NULL) {
4105 PyErr_NoMemory();
4106 return NULL;
4107 }
4108 unicode_copy_as_widechar(unicode, w, wlen + 1);
4109 _PyUnicode_WSTR(unicode) = w;
4110 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4111 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004112 }
4113 }
4114 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004115 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004116 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004117}
4118
Inada Naoki2c4928d2020-06-17 20:09:44 +09004119/* Deprecated APIs */
4120
4121_Py_COMP_DIAG_PUSH
4122_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4123
Alexander Belopolsky40018472011-02-26 01:02:56 +00004124Py_UNICODE *
4125PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004127 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128}
4129
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004130const Py_UNICODE *
4131_PyUnicode_AsUnicode(PyObject *unicode)
4132{
4133 Py_ssize_t size;
4134 const Py_UNICODE *wstr;
4135
4136 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4137 if (wstr && wcslen(wstr) != (size_t)size) {
4138 PyErr_SetString(PyExc_ValueError, "embedded null character");
4139 return NULL;
4140 }
4141 return wstr;
4142}
4143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144
Alexander Belopolsky40018472011-02-26 01:02:56 +00004145Py_ssize_t
4146PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147{
4148 if (!PyUnicode_Check(unicode)) {
4149 PyErr_BadArgument();
4150 goto onError;
4151 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004152 if (_PyUnicode_WSTR(unicode) == NULL) {
4153 if (PyUnicode_AsUnicode(unicode) == NULL)
4154 goto onError;
4155 }
4156 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 return -1;
4160}
4161
Inada Naoki2c4928d2020-06-17 20:09:44 +09004162_Py_COMP_DIAG_POP
4163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004164Py_ssize_t
4165PyUnicode_GetLength(PyObject *unicode)
4166{
Victor Stinner07621332012-06-16 04:53:46 +02004167 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004168 PyErr_BadArgument();
4169 return -1;
4170 }
Victor Stinner07621332012-06-16 04:53:46 +02004171 if (PyUnicode_READY(unicode) == -1)
4172 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004173 return PyUnicode_GET_LENGTH(unicode);
4174}
4175
4176Py_UCS4
4177PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4178{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004179 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004180 int kind;
4181
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004182 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004183 PyErr_BadArgument();
4184 return (Py_UCS4)-1;
4185 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004186 if (PyUnicode_READY(unicode) == -1) {
4187 return (Py_UCS4)-1;
4188 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004189 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004190 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004191 return (Py_UCS4)-1;
4192 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004193 data = PyUnicode_DATA(unicode);
4194 kind = PyUnicode_KIND(unicode);
4195 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196}
4197
4198int
4199PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4200{
4201 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004202 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004203 return -1;
4204 }
Victor Stinner488fa492011-12-12 00:01:39 +01004205 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004206 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004207 PyErr_SetString(PyExc_IndexError, "string index out of range");
4208 return -1;
4209 }
Victor Stinner488fa492011-12-12 00:01:39 +01004210 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004211 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004212 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4213 PyErr_SetString(PyExc_ValueError, "character out of range");
4214 return -1;
4215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004216 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4217 index, ch);
4218 return 0;
4219}
4220
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is statically allocated; callers must not free
   or modify it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4226
Victor Stinner554f3f02010-06-16 23:33:54 +00004227/* create or adjust a UnicodeDecodeError */
4228static void
4229make_decode_exception(PyObject **exceptionObject,
4230 const char *encoding,
4231 const char *input, Py_ssize_t length,
4232 Py_ssize_t startpos, Py_ssize_t endpos,
4233 const char *reason)
4234{
4235 if (*exceptionObject == NULL) {
4236 *exceptionObject = PyUnicodeDecodeError_Create(
4237 encoding, input, length, startpos, endpos, reason);
4238 }
4239 else {
4240 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4241 goto onError;
4242 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4243 goto onError;
4244 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4245 goto onError;
4246 }
4247 return;
4248
4249onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004250 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004251}
4252
Steve Dowercc16be82016-09-08 10:35:16 -07004253#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004254static int
4255widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4256{
4257 if (newsize > *size) {
4258 wchar_t *newbuf = *buf;
4259 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4260 PyErr_NoMemory();
4261 return -1;
4262 }
4263 *buf = newbuf;
4264 }
4265 *size = newsize;
4266 return 0;
4267}
4268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269/* error handling callback helper:
4270 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004271 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 and adjust various state variables.
4273 return 0 on success, -1 on error
4274*/
4275
Alexander Belopolsky40018472011-02-26 01:02:56 +00004276static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004277unicode_decode_call_errorhandler_wchar(
4278 const char *errors, PyObject **errorHandler,
4279 const char *encoding, const char *reason,
4280 const char **input, const char **inend, Py_ssize_t *startinpos,
4281 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004282 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004284 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285
4286 PyObject *restuple = NULL;
4287 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004288 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004289 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004290 Py_ssize_t requiredsize;
4291 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004292 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004293 wchar_t *repwstr;
4294 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295
4296 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004297 *errorHandler = PyCodec_LookupError(errors);
4298 if (*errorHandler == NULL)
4299 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 }
4301
Victor Stinner554f3f02010-06-16 23:33:54 +00004302 make_decode_exception(exceptionObject,
4303 encoding,
4304 *input, *inend - *input,
4305 *startinpos, *endinpos,
4306 reason);
4307 if (*exceptionObject == NULL)
4308 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004309
Petr Viktorinffd97532020-02-11 17:46:57 +01004310 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004311 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004313 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004314 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004317 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004319
4320 /* Copy back the bytes variables, which might have been modified by the
4321 callback */
4322 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4323 if (!inputobj)
4324 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004325 *input = PyBytes_AS_STRING(inputobj);
4326 insize = PyBytes_GET_SIZE(inputobj);
4327 *inend = *input + insize;
4328 /* we can DECREF safely, as the exception has another reference,
4329 so the object won't go away. */
4330 Py_DECREF(inputobj);
4331
4332 if (newpos<0)
4333 newpos = insize+newpos;
4334 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004335 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004336 goto onError;
4337 }
4338
4339 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4340 if (repwstr == NULL)
4341 goto onError;
4342 /* need more space? (at least enough for what we
4343 have+the replacement+the rest of the string (starting
4344 at the new input position), so we won't have to check space
4345 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004346 requiredsize = *outpos;
4347 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4348 goto overflow;
4349 requiredsize += repwlen;
4350 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4351 goto overflow;
4352 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004353 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004355 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004357 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004358 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004359 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004360 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004361 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004362 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004363 *endinpos = newpos;
4364 *inptr = *input + newpos;
4365
4366 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004367 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004368 return 0;
4369
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004370 overflow:
4371 PyErr_SetString(PyExc_OverflowError,
4372 "decoded result is too long for a Python string");
4373
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004374 onError:
4375 Py_XDECREF(restuple);
4376 return -1;
4377}
Steve Dowercc16be82016-09-08 10:35:16 -07004378#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004379
4380static int
4381unicode_decode_call_errorhandler_writer(
4382 const char *errors, PyObject **errorHandler,
4383 const char *encoding, const char *reason,
4384 const char **input, const char **inend, Py_ssize_t *startinpos,
4385 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4386 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4387{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004388 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004389
4390 PyObject *restuple = NULL;
4391 PyObject *repunicode = NULL;
4392 Py_ssize_t insize;
4393 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004394 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004395 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004396 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004397 int need_to_grow = 0;
4398 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004399
4400 if (*errorHandler == NULL) {
4401 *errorHandler = PyCodec_LookupError(errors);
4402 if (*errorHandler == NULL)
4403 goto onError;
4404 }
4405
4406 make_decode_exception(exceptionObject,
4407 encoding,
4408 *input, *inend - *input,
4409 *startinpos, *endinpos,
4410 reason);
4411 if (*exceptionObject == NULL)
4412 goto onError;
4413
Petr Viktorinffd97532020-02-11 17:46:57 +01004414 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004415 if (restuple == NULL)
4416 goto onError;
4417 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004418 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 goto onError;
4420 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004421 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004422 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004423
4424 /* Copy back the bytes variables, which might have been modified by the
4425 callback */
4426 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4427 if (!inputobj)
4428 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004429 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004430 *input = PyBytes_AS_STRING(inputobj);
4431 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004432 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004433 /* we can DECREF safely, as the exception has another reference,
4434 so the object won't go away. */
4435 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004436
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004439 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004440 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004441 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443
Victor Stinner170ca6f2013-04-18 00:25:28 +02004444 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004445 if (replen > 1) {
4446 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004447 need_to_grow = 1;
4448 }
4449 new_inptr = *input + newpos;
4450 if (*inend - new_inptr > remain) {
4451 /* We don't know the decoding algorithm here so we make the worst
4452 assumption that one byte decodes to one unicode character.
4453 If unfortunately one byte could decode to more unicode characters,
4454 the decoder may write out-of-bound then. Is it possible for the
4455 algorithms using this function? */
4456 writer->min_length += *inend - new_inptr - remain;
4457 need_to_grow = 1;
4458 }
4459 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004460 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004461 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004462 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4463 goto onError;
4464 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004465 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004466 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004467
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004469 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004472 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004473 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478}
4479
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480/* --- UTF-7 Codec -------------------------------------------------------- */
4481
Antoine Pitrou244651a2009-05-04 18:56:13 +00004482/* See RFC2152 for details. We encode conservatively and decode liberally. */
4483
4484/* Three simple macros defining base-64. */
4485
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  The argument is evaluated several times. */

#define IS_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ||     \
     ('a' <= (c) && (c) <= 'z') ||     \
     ('0' <= (c) && (c) <= '9') ||     \
     '+' == (c) || '/' == (c))
4493
/* Map a base-64 character to its 6-bit value: A-Z -> 0..25, a-z -> 26..51,
   0-9 -> 52..61, '+' -> 62, and anything else -> 63.  The caller must have
   checked IS_BASE64(c) first, so "anything else" is '/'.  The argument is
   evaluated several times. */

#define FROM_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :     \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :     \
     ((c) == '+') ? 62 : 63)
4501
/* Return the base-64 alphabet character for the low 6 bits of n
   (higher bits are masked off). */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
     "abcdefghijklmnopqrstuvwxyz"      \
     "0123456789+/"[(n) & 0x3f])
4506
/* DECODE_DIRECT: true if a byte found in UTF-7 input outside a shift
 * sequence decodes to itself.  Decoding is permissive: every value up to
 * 127 qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
4514
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is read-only.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character code; only 1..127 can ever be direct
 *   directO  - non-zero to emit Set O characters unencoded
 *   directWS - non-zero to emit whitespace unencoded
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions such as `a || b` can be passed safely.  `c` is evaluated
 * several times, so it must be free of side effects.  The range check
 * comes first so the table is never indexed out of bounds. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560
Alexander Belopolsky40018472011-02-26 01:02:56 +00004561PyObject *
4562PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004563 Py_ssize_t size,
4564 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004565{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004566 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4567}
4568
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569/* The decoder. The only state we preserve is our read position,
4570 * i.e. how many characters we have consumed. So if we end in the
4571 * middle of a shift sequence we have to back off the read position
4572 * and the output to the beginning of the sequence, otherwise we lose
4573 * all the shift state (seen bits, number of bits seen, high
4574 * surrogate). */
4575
Alexander Belopolsky40018472011-02-26 01:02:56 +00004576PyObject *
4577PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004578 Py_ssize_t size,
4579 const char *errors,
4580 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004581{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004583 Py_ssize_t startinpos;
4584 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004586 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004587 const char *errmsg = "";
4588 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004589 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 unsigned int base64bits = 0;
4591 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004592 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 PyObject *errorHandler = NULL;
4594 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004595
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004596 if (size == 0) {
4597 if (consumed)
4598 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004599 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004600 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004601
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004602 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004603 _PyUnicodeWriter_Init(&writer);
4604 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004605
4606 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004607 e = s + size;
4608
4609 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004610 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004612 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 if (inShift) { /* in a base-64 section */
4615 if (IS_BASE64(ch)) { /* consume a base-64 character */
4616 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4617 base64bits += 6;
4618 s++;
4619 if (base64bits >= 16) {
4620 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004621 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 base64bits -= 16;
4623 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004624 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 if (surrogate) {
4626 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004627 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4628 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004629 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004630 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004631 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004632 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 }
4634 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004635 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004636 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004637 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 }
4639 }
Victor Stinner551ac952011-11-29 22:58:13 +01004640 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004641 /* first surrogate */
4642 surrogate = outCh;
4643 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004645 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004646 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 }
4648 }
4649 }
4650 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004651 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 if (base64bits > 0) { /* left-over bits */
4653 if (base64bits >= 6) {
4654 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004655 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004656 errmsg = "partial character in shift sequence";
4657 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004658 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004659 else {
4660 /* Some bits remain; they should be zero */
4661 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004662 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 errmsg = "non-zero padding bits in shift sequence";
4664 goto utf7Error;
4665 }
4666 }
4667 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004668 if (surrogate && DECODE_DIRECT(ch)) {
4669 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4670 goto onError;
4671 }
4672 surrogate = 0;
4673 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 /* '-' is absorbed; other terminating
4675 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004676 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004677 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004678 }
4679 }
4680 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004681 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004682 s++; /* consume '+' */
4683 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004684 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004685 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004686 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004688 else if (s < e && !IS_BASE64(*s)) {
4689 s++;
4690 errmsg = "ill-formed sequence";
4691 goto utf7Error;
4692 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004695 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004696 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004698 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004699 }
4700 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004701 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004702 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004703 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004704 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706 else {
4707 startinpos = s-starts;
4708 s++;
4709 errmsg = "unexpected special character";
4710 goto utf7Error;
4711 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004713utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004715 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 errors, &errorHandler,
4717 "utf7", errmsg,
4718 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004719 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004721 }
4722
Antoine Pitrou244651a2009-05-04 18:56:13 +00004723 /* end of string */
4724
4725 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4726 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004727 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004728 if (surrogate ||
4729 (base64bits >= 6) ||
4730 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004731 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004732 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004733 errors, &errorHandler,
4734 "utf7", "unterminated shift sequence",
4735 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004736 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004737 goto onError;
4738 if (s < e)
4739 goto restart;
4740 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004741 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004742
4743 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004744 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004745 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004746 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004747 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004748 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004749 writer.kind, writer.data, shiftOutStart);
4750 Py_XDECREF(errorHandler);
4751 Py_XDECREF(exc);
4752 _PyUnicodeWriter_Dealloc(&writer);
4753 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004754 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004755 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004756 }
4757 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004758 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004759 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004760 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 Py_XDECREF(errorHandler);
4763 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004764 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 Py_XDECREF(errorHandler);
4768 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004769 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770 return NULL;
4771}
4772
4773
Alexander Belopolsky40018472011-02-26 01:02:56 +00004774PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004775_PyUnicode_EncodeUTF7(PyObject *str,
4776 int base64SetO,
4777 int base64WhiteSpace,
4778 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004779{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004780 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004781 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004782 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004783 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004784 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004785 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004786 unsigned int base64bits = 0;
4787 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004789 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004790
Benjamin Petersonbac79492012-01-14 13:34:47 -05004791 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004792 return NULL;
4793 kind = PyUnicode_KIND(str);
4794 data = PyUnicode_DATA(str);
4795 len = PyUnicode_GET_LENGTH(str);
4796
4797 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004799
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004800 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004801 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004802 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004803 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004804 if (v == NULL)
4805 return NULL;
4806
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004807 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004808 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004809 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004810
Antoine Pitrou244651a2009-05-04 18:56:13 +00004811 if (inShift) {
4812 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4813 /* shifting out */
4814 if (base64bits) { /* output remaining bits */
4815 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4816 base64buffer = 0;
4817 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004818 }
4819 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004820 /* Characters not in the BASE64 set implicitly unshift the sequence
4821 so no '-' is required, except if the character is itself a '-' */
4822 if (IS_BASE64(ch) || ch == '-') {
4823 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004824 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004825 *out++ = (char) ch;
4826 }
4827 else {
4828 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004829 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004830 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004831 else { /* not in a shift sequence */
4832 if (ch == '+') {
4833 *out++ = '+';
4834 *out++ = '-';
4835 }
4836 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4837 *out++ = (char) ch;
4838 }
4839 else {
4840 *out++ = '+';
4841 inShift = 1;
4842 goto encode_char;
4843 }
4844 }
4845 continue;
4846encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004847 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004848 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004849
Antoine Pitrou244651a2009-05-04 18:56:13 +00004850 /* code first surrogate */
4851 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004852 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004853 while (base64bits >= 6) {
4854 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4855 base64bits -= 6;
4856 }
4857 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004858 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004859 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004860 base64bits += 16;
4861 base64buffer = (base64buffer << 16) | ch;
4862 while (base64bits >= 6) {
4863 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4864 base64bits -= 6;
4865 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004866 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004867 if (base64bits)
4868 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4869 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004870 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004871 if (_PyBytes_Resize(&v, out - start) < 0)
4872 return NULL;
4873 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004874}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004875PyObject *
4876PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4877 Py_ssize_t size,
4878 int base64SetO,
4879 int base64WhiteSpace,
4880 const char *errors)
4881{
4882 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004883 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004884 if (tmp == NULL)
4885 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004886 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004887 base64WhiteSpace, errors);
4888 Py_DECREF(tmp);
4889 return result;
4890}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004891
Antoine Pitrou244651a2009-05-04 18:56:13 +00004892#undef IS_BASE64
4893#undef FROM_BASE64
4894#undef TO_BASE64
4895#undef DECODE_DIRECT
4896#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004897
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898/* --- UTF-8 Codec -------------------------------------------------------- */
4899
Alexander Belopolsky40018472011-02-26 01:02:56 +00004900PyObject *
4901PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004902 Py_ssize_t size,
4903 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904{
Walter Dörwald69652032004-09-07 20:24:22 +00004905 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4906}
4907
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908#include "stringlib/asciilib.h"
4909#include "stringlib/codecs.h"
4910#include "stringlib/undef.h"
4911
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004912#include "stringlib/ucs1lib.h"
4913#include "stringlib/codecs.h"
4914#include "stringlib/undef.h"
4915
4916#include "stringlib/ucs2lib.h"
4917#include "stringlib/codecs.h"
4918#include "stringlib/undef.h"
4919
4920#include "stringlib/ucs4lib.h"
4921#include "stringlib/codecs.h"
4922#include "stringlib/undef.h"
4923
Antoine Pitrouab868312009-01-10 15:40:25 +00004924/* Mask to quickly check whether a C 'long' contains a
4925 non-ASCII, UTF8-encoded char. */
4926#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004927# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004928#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004929# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004930#else
4931# error C 'long' size should be either 4 or 8!
4932#endif
4933
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934static Py_ssize_t
4935ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004936{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004938 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004939
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004940 /*
4941 * Issue #17237: m68k is a bit different from most architectures in
4942 * that objects do not use "natural alignment" - for example, int and
4943 * long are only aligned at 2-byte boundaries. Therefore the assert()
4944 * won't work; also, tests have shown that skipping the "optimised
4945 * version" will even speed up m68k.
4946 */
4947#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004948#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004949 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4950 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004951 /* Fast path, see in STRINGLIB(utf8_decode) for
4952 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004953 /* Help allocation */
4954 const char *_p = p;
4955 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 while (_p < aligned_end) {
4957 unsigned long value = *(const unsigned long *) _p;
4958 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004959 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 *((unsigned long *)q) = value;
4961 _p += SIZEOF_LONG;
4962 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004963 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004964 p = _p;
4965 while (p < end) {
4966 if ((unsigned char)*p & 0x80)
4967 break;
4968 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004972#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004973#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004974 while (p < end) {
4975 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4976 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004977 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004978 /* Help allocation */
4979 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004980 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004981 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 if (value & ASCII_CHAR_MASK)
4983 break;
4984 _p += SIZEOF_LONG;
4985 }
4986 p = _p;
4987 if (_p == end)
4988 break;
4989 }
4990 if ((unsigned char)*p & 0x80)
4991 break;
4992 ++p;
4993 }
4994 memcpy(dest, start, p - start);
4995 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996}
Antoine Pitrouab868312009-01-10 15:40:25 +00004997
Victor Stinner709d23d2019-05-02 14:56:30 -04004998static PyObject *
4999unicode_decode_utf8(const char *s, Py_ssize_t size,
5000 _Py_error_handler error_handler, const char *errors,
5001 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005002{
Victor Stinner785938e2011-12-11 20:09:03 +01005003 if (size == 0) {
5004 if (consumed)
5005 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005006 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005007 }
5008
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5010 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01005011 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 *consumed = 1;
5013 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005014 }
5015
Inada Naoki770847a2019-06-24 12:30:24 +09005016 const char *starts = s;
5017 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005018
Inada Naoki770847a2019-06-24 12:30:24 +09005019 // fast path: try ASCII string.
5020 PyObject *u = PyUnicode_New(size, 127);
5021 if (u == NULL) {
5022 return NULL;
5023 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005024 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005025 if (s == end) {
5026 return u;
5027 }
5028
5029 // Use _PyUnicodeWriter after fast path is failed.
5030 _PyUnicodeWriter writer;
5031 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5032 writer.pos = s - starts;
5033
5034 Py_ssize_t startinpos, endinpos;
5035 const char *errmsg = "";
5036 PyObject *error_handler_obj = NULL;
5037 PyObject *exc = NULL;
5038
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 while (s < end) {
5040 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005041 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005042
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005044 if (PyUnicode_IS_ASCII(writer.buffer))
5045 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005046 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005047 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005048 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005049 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005050 } else {
5051 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005052 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005053 }
5054
5055 switch (ch) {
5056 case 0:
5057 if (s == end || consumed)
5058 goto End;
5059 errmsg = "unexpected end of data";
5060 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005061 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005062 break;
5063 case 1:
5064 errmsg = "invalid start byte";
5065 startinpos = s - starts;
5066 endinpos = startinpos + 1;
5067 break;
5068 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005069 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5070 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5071 {
5072 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005073 goto End;
5074 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005075 /* fall through */
5076 case 3:
5077 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078 errmsg = "invalid continuation byte";
5079 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005080 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 break;
5082 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005083 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 goto onError;
5085 continue;
5086 }
5087
Victor Stinner1d65d912015-10-05 13:43:50 +02005088 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005089 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005090
5091 switch (error_handler) {
5092 case _Py_ERROR_IGNORE:
5093 s += (endinpos - startinpos);
5094 break;
5095
5096 case _Py_ERROR_REPLACE:
5097 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5098 goto onError;
5099 s += (endinpos - startinpos);
5100 break;
5101
5102 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005103 {
5104 Py_ssize_t i;
5105
Victor Stinner1d65d912015-10-05 13:43:50 +02005106 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5107 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005108 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005109 ch = (Py_UCS4)(unsigned char)(starts[i]);
5110 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5111 ch + 0xdc00);
5112 writer.pos++;
5113 }
5114 s += (endinpos - startinpos);
5115 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005116 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005117
5118 default:
5119 if (unicode_decode_call_errorhandler_writer(
5120 errors, &error_handler_obj,
5121 "utf-8", errmsg,
5122 &starts, &end, &startinpos, &endinpos, &exc, &s,
5123 &writer))
5124 goto onError;
5125 }
Victor Stinner785938e2011-12-11 20:09:03 +01005126 }
5127
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005128End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005129 if (consumed)
5130 *consumed = s - starts;
5131
Victor Stinner1d65d912015-10-05 13:43:50 +02005132 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005133 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005134 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005135
5136onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005137 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005138 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005139 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005140 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005141}
5142
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005143
Victor Stinner709d23d2019-05-02 14:56:30 -04005144PyObject *
5145PyUnicode_DecodeUTF8Stateful(const char *s,
5146 Py_ssize_t size,
5147 const char *errors,
5148 Py_ssize_t *consumed)
5149{
5150 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5151}
5152
5153
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005154/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5155 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005156
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005157 On success, write a pointer to a newly allocated wide character string into
5158 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5159 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005160
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005161 On memory allocation failure, return -1.
5162
5163 On decoding error (if surrogateescape is zero), return -2. If wlen is
5164 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5165 is not NULL, write the decoding error message into *reason. */
5166int
5167_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005168 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005169{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005170 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005171 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005172 wchar_t *unicode;
5173 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005174
Victor Stinner3d4226a2018-08-29 22:21:32 +02005175 int surrogateescape = 0;
5176 int surrogatepass = 0;
5177 switch (errors)
5178 {
5179 case _Py_ERROR_STRICT:
5180 break;
5181 case _Py_ERROR_SURROGATEESCAPE:
5182 surrogateescape = 1;
5183 break;
5184 case _Py_ERROR_SURROGATEPASS:
5185 surrogatepass = 1;
5186 break;
5187 default:
5188 return -3;
5189 }
5190
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005191 /* Note: size will always be longer than the resulting Unicode
5192 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005193 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005194 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005195 }
5196
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005197 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005198 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005199 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005200 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005201
5202 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005203 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005204 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005205 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005206 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005207#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005208 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005209#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005210 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005211#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005212 if (ch > 0xFF) {
5213#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005214 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005215#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005216 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005217 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005218 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5219 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5220#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005221 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005222 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005223 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005224 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005225 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005226
5227 if (surrogateescape) {
5228 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5229 }
5230 else {
5231 /* Is it a valid three-byte code? */
5232 if (surrogatepass
5233 && (e - s) >= 3
5234 && (s[0] & 0xf0) == 0xe0
5235 && (s[1] & 0xc0) == 0x80
5236 && (s[2] & 0xc0) == 0x80)
5237 {
5238 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5239 s += 3;
5240 unicode[outpos++] = ch;
5241 }
5242 else {
5243 PyMem_RawFree(unicode );
5244 if (reason != NULL) {
5245 switch (ch) {
5246 case 0:
5247 *reason = "unexpected end of data";
5248 break;
5249 case 1:
5250 *reason = "invalid start byte";
5251 break;
5252 /* 2, 3, 4 */
5253 default:
5254 *reason = "invalid continuation byte";
5255 break;
5256 }
5257 }
5258 if (wlen != NULL) {
5259 *wlen = s - orig_s;
5260 }
5261 return -2;
5262 }
5263 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005264 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005265 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005266 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005267 if (wlen) {
5268 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005269 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005270 *wstr = unicode;
5271 return 0;
5272}
5273
Victor Stinner5f9cf232019-03-19 01:46:25 +01005274
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005275wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005276_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5277 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005278{
5279 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005280 int res = _Py_DecodeUTF8Ex(arg, arglen,
5281 &wstr, wlen,
5282 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005283 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005284 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5285 assert(res != -3);
5286 if (wlen) {
5287 *wlen = (size_t)res;
5288 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005289 return NULL;
5290 }
5291 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005292}
5293
Antoine Pitrouab868312009-01-10 15:40:25 +00005294
Victor Stinnere47e6982017-12-21 15:45:16 +01005295/* UTF-8 encoder using the surrogateescape error handler .
5296
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005297 On success, return 0 and write the newly allocated character string (use
5298 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005299
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005300 On encoding failure, return -2 and write the position of the invalid
5301 surrogate character into *error_pos (if error_pos is set) and the decoding
5302 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005303
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005304 On memory allocation failure, return -1. */
5305int
5306_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005307 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005308{
5309 const Py_ssize_t max_char_size = 4;
5310 Py_ssize_t len = wcslen(text);
5311
5312 assert(len >= 0);
5313
Victor Stinner3d4226a2018-08-29 22:21:32 +02005314 int surrogateescape = 0;
5315 int surrogatepass = 0;
5316 switch (errors)
5317 {
5318 case _Py_ERROR_STRICT:
5319 break;
5320 case _Py_ERROR_SURROGATEESCAPE:
5321 surrogateescape = 1;
5322 break;
5323 case _Py_ERROR_SURROGATEPASS:
5324 surrogatepass = 1;
5325 break;
5326 default:
5327 return -3;
5328 }
5329
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005330 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5331 return -1;
5332 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005333 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005334 if (raw_malloc) {
5335 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005336 }
5337 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005338 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005339 }
5340 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005341 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005342 }
5343
5344 char *p = bytes;
5345 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005346 for (i = 0; i < len; ) {
5347 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005348 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005349 i++;
5350#if Py_UNICODE_SIZE == 2
5351 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5352 && i < len
5353 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5354 {
5355 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5356 i++;
5357 }
5358#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005359
5360 if (ch < 0x80) {
5361 /* Encode ASCII */
5362 *p++ = (char) ch;
5363
5364 }
5365 else if (ch < 0x0800) {
5366 /* Encode Latin-1 */
5367 *p++ = (char)(0xc0 | (ch >> 6));
5368 *p++ = (char)(0x80 | (ch & 0x3f));
5369 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005370 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005371 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005372 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005373 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005374 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005375 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005376 if (reason != NULL) {
5377 *reason = "encoding error";
5378 }
5379 if (raw_malloc) {
5380 PyMem_RawFree(bytes);
5381 }
5382 else {
5383 PyMem_Free(bytes);
5384 }
5385 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005386 }
5387 *p++ = (char)(ch & 0xff);
5388 }
5389 else if (ch < 0x10000) {
5390 *p++ = (char)(0xe0 | (ch >> 12));
5391 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5392 *p++ = (char)(0x80 | (ch & 0x3f));
5393 }
5394 else { /* ch >= 0x10000 */
5395 assert(ch <= MAX_UNICODE);
5396 /* Encode UCS4 Unicode ordinals */
5397 *p++ = (char)(0xf0 | (ch >> 18));
5398 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5399 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5400 *p++ = (char)(0x80 | (ch & 0x3f));
5401 }
5402 }
5403 *p++ = '\0';
5404
5405 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005406 char *bytes2;
5407 if (raw_malloc) {
5408 bytes2 = PyMem_RawRealloc(bytes, final_size);
5409 }
5410 else {
5411 bytes2 = PyMem_Realloc(bytes, final_size);
5412 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005413 if (bytes2 == NULL) {
5414 if (error_pos != NULL) {
5415 *error_pos = (size_t)-1;
5416 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005417 if (raw_malloc) {
5418 PyMem_RawFree(bytes);
5419 }
5420 else {
5421 PyMem_Free(bytes);
5422 }
5423 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005424 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005425 *str = bytes2;
5426 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005427}
5428
5429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005430/* Primary internal function which creates utf8 encoded bytes objects.
5431
5432 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005433 and allocate exactly as much space needed at the end. Else allocate the
5434 maximum possible needed (4 result bytes per Unicode character), and return
5435 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005436*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005437static PyObject *
5438unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5439 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 if (!PyUnicode_Check(unicode)) {
5442 PyErr_BadArgument();
5443 return NULL;
5444 }
5445
5446 if (PyUnicode_READY(unicode) == -1)
5447 return NULL;
5448
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005449 if (PyUnicode_UTF8(unicode))
5450 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5451 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452
Inada Naoki02a4d572020-02-27 13:48:59 +09005453 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005454 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005455 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5456
5457 _PyBytesWriter writer;
5458 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005459
Benjamin Petersonead6b532011-12-20 17:23:42 -06005460 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005461 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005462 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005463 case PyUnicode_1BYTE_KIND:
5464 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5465 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005466 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5467 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005468 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005469 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5470 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005471 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005472 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5473 break;
Tim Peters602f7402002-04-27 18:03:26 +00005474 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005475
5476 if (end == NULL) {
5477 _PyBytesWriter_Dealloc(&writer);
5478 return NULL;
5479 }
5480 return _PyBytesWriter_Finish(&writer, end);
5481}
5482
5483static int
5484unicode_fill_utf8(PyObject *unicode)
5485{
5486 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5487 assert(!PyUnicode_IS_ASCII(unicode));
5488
5489 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005490 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005491 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5492
5493 _PyBytesWriter writer;
5494 char *end;
5495
5496 switch (kind) {
5497 default:
5498 Py_UNREACHABLE();
5499 case PyUnicode_1BYTE_KIND:
5500 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5501 _Py_ERROR_STRICT, NULL);
5502 break;
5503 case PyUnicode_2BYTE_KIND:
5504 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5505 _Py_ERROR_STRICT, NULL);
5506 break;
5507 case PyUnicode_4BYTE_KIND:
5508 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5509 _Py_ERROR_STRICT, NULL);
5510 break;
5511 }
5512 if (end == NULL) {
5513 _PyBytesWriter_Dealloc(&writer);
5514 return -1;
5515 }
5516
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005517 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005518 PyBytes_AS_STRING(writer.buffer);
5519 Py_ssize_t len = end - start;
5520
5521 char *cache = PyObject_MALLOC(len + 1);
5522 if (cache == NULL) {
5523 _PyBytesWriter_Dealloc(&writer);
5524 PyErr_NoMemory();
5525 return -1;
5526 }
5527 _PyUnicode_UTF8(unicode) = cache;
5528 _PyUnicode_UTF8_LENGTH(unicode) = len;
5529 memcpy(cache, start, len);
5530 cache[len] = '\0';
5531 _PyBytesWriter_Dealloc(&writer);
5532 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533}
5534
Alexander Belopolsky40018472011-02-26 01:02:56 +00005535PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005536_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5537{
5538 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5539}
5540
5541
5542PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5544 Py_ssize_t size,
5545 const char *errors)
5546{
5547 PyObject *v, *unicode;
5548
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005549 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 if (unicode == NULL)
5551 return NULL;
5552 v = _PyUnicode_AsUTF8String(unicode, errors);
5553 Py_DECREF(unicode);
5554 return v;
5555}
5556
5557PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005558PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005560 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561}
5562
Walter Dörwald41980ca2007-08-16 21:55:45 +00005563/* --- UTF-32 Codec ------------------------------------------------------- */
5564
5565PyObject *
5566PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 Py_ssize_t size,
5568 const char *errors,
5569 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005570{
5571 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5572}
5573
5574PyObject *
5575PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 Py_ssize_t size,
5577 const char *errors,
5578 int *byteorder,
5579 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005580{
5581 const char *starts = s;
5582 Py_ssize_t startinpos;
5583 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005584 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005585 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005586 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005587 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005588 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005589 PyObject *errorHandler = NULL;
5590 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005591
Andy Lestere6be9b52020-02-11 20:28:35 -06005592 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005593 e = q + size;
5594
5595 if (byteorder)
5596 bo = *byteorder;
5597
5598 /* Check for BOM marks (U+FEFF) in the input and adjust current
5599 byte order setting accordingly. In native mode, the leading BOM
5600 mark is skipped, in all other modes, it is copied to the output
5601 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005602 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005603 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005604 if (bom == 0x0000FEFF) {
5605 bo = -1;
5606 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005608 else if (bom == 0xFFFE0000) {
5609 bo = 1;
5610 q += 4;
5611 }
5612 if (byteorder)
5613 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005614 }
5615
Victor Stinnere64322e2012-10-30 23:12:47 +01005616 if (q == e) {
5617 if (consumed)
5618 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005619 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005620 }
5621
Victor Stinnere64322e2012-10-30 23:12:47 +01005622#ifdef WORDS_BIGENDIAN
5623 le = bo < 0;
5624#else
5625 le = bo <= 0;
5626#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005627 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005628
Victor Stinner8f674cc2013-04-17 23:02:17 +02005629 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005630 writer.min_length = (e - q + 3) / 4;
5631 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005632 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005633
Victor Stinnere64322e2012-10-30 23:12:47 +01005634 while (1) {
5635 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005636 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005637
Victor Stinnere64322e2012-10-30 23:12:47 +01005638 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005639 enum PyUnicode_Kind kind = writer.kind;
5640 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005641 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005642 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005643 if (le) {
5644 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005645 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005646 if (ch > maxch)
5647 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005648 if (kind != PyUnicode_1BYTE_KIND &&
5649 Py_UNICODE_IS_SURROGATE(ch))
5650 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005651 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005652 q += 4;
5653 } while (q <= last);
5654 }
5655 else {
5656 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005657 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005658 if (ch > maxch)
5659 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005660 if (kind != PyUnicode_1BYTE_KIND &&
5661 Py_UNICODE_IS_SURROGATE(ch))
5662 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005663 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005664 q += 4;
5665 } while (q <= last);
5666 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005667 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005668 }
5669
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005670 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005671 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005672 startinpos = ((const char *)q) - starts;
5673 endinpos = startinpos + 4;
5674 }
5675 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005676 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005678 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005680 startinpos = ((const char *)q) - starts;
5681 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005683 else {
5684 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005685 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005686 goto onError;
5687 q += 4;
5688 continue;
5689 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005690 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005691 startinpos = ((const char *)q) - starts;
5692 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005694
5695 /* The remaining input chars are ignored if the callback
5696 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005697 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005699 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005701 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005703 }
5704
Walter Dörwald41980ca2007-08-16 21:55:45 +00005705 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005707
Walter Dörwald41980ca2007-08-16 21:55:45 +00005708 Py_XDECREF(errorHandler);
5709 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005710 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005711
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005713 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005714 Py_XDECREF(errorHandler);
5715 Py_XDECREF(exc);
5716 return NULL;
5717}
5718
5719PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005720_PyUnicode_EncodeUTF32(PyObject *str,
5721 const char *errors,
5722 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005723{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005724 enum PyUnicode_Kind kind;
5725 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005726 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005727 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005728 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005729#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005730 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005731#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005732 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005733#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005734 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005735 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005736 PyObject *errorHandler = NULL;
5737 PyObject *exc = NULL;
5738 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005739
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740 if (!PyUnicode_Check(str)) {
5741 PyErr_BadArgument();
5742 return NULL;
5743 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005744 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005745 return NULL;
5746 kind = PyUnicode_KIND(str);
5747 data = PyUnicode_DATA(str);
5748 len = PyUnicode_GET_LENGTH(str);
5749
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005750 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005751 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005752 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005753 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005754 if (v == NULL)
5755 return NULL;
5756
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005757 /* output buffer is 4-bytes aligned */
5758 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005759 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005760 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005761 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005762 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005763 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005764
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005765 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005766 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005767 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 else
5770 encoding = "utf-32";
5771
5772 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005773 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5774 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005775 }
5776
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005777 pos = 0;
5778 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005779 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005780
5781 if (kind == PyUnicode_2BYTE_KIND) {
5782 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5783 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005784 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005785 else {
5786 assert(kind == PyUnicode_4BYTE_KIND);
5787 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5788 &out, native_ordering);
5789 }
5790 if (pos == len)
5791 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005792
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005793 rep = unicode_encode_call_errorhandler(
5794 errors, &errorHandler,
5795 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005796 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005797 if (!rep)
5798 goto error;
5799
5800 if (PyBytes_Check(rep)) {
5801 repsize = PyBytes_GET_SIZE(rep);
5802 if (repsize & 3) {
5803 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005804 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005805 "surrogates not allowed");
5806 goto error;
5807 }
5808 moreunits = repsize / 4;
5809 }
5810 else {
5811 assert(PyUnicode_Check(rep));
5812 if (PyUnicode_READY(rep) < 0)
5813 goto error;
5814 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5815 if (!PyUnicode_IS_ASCII(rep)) {
5816 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005817 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005818 "surrogates not allowed");
5819 goto error;
5820 }
5821 }
5822
5823 /* four bytes are reserved for each surrogate */
5824 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005825 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005826 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005827 /* integer overflow */
5828 PyErr_NoMemory();
5829 goto error;
5830 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005831 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005832 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005833 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005834 }
5835
5836 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005837 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005838 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005840 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005841 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5842 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005843 }
5844
5845 Py_CLEAR(rep);
5846 }
5847
5848 /* Cut back to size actually needed. This is necessary for, for example,
5849 encoding of a string containing isolated surrogates and the 'ignore'
5850 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005851 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005852 if (nsize != PyBytes_GET_SIZE(v))
5853 _PyBytes_Resize(&v, nsize);
5854 Py_XDECREF(errorHandler);
5855 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005856 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005857 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005858 error:
5859 Py_XDECREF(rep);
5860 Py_XDECREF(errorHandler);
5861 Py_XDECREF(exc);
5862 Py_XDECREF(v);
5863 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005864}
5865
Alexander Belopolsky40018472011-02-26 01:02:56 +00005866PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5868 Py_ssize_t size,
5869 const char *errors,
5870 int byteorder)
5871{
5872 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005873 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874 if (tmp == NULL)
5875 return NULL;
5876 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5877 Py_DECREF(tmp);
5878 return result;
5879}
5880
5881PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005882PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005883{
Victor Stinnerb960b342011-11-20 19:12:52 +01005884 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005885}
5886
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887/* --- UTF-16 Codec ------------------------------------------------------- */
5888
Tim Peters772747b2001-08-09 22:21:55 +00005889PyObject *
5890PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 Py_ssize_t size,
5892 const char *errors,
5893 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894{
Walter Dörwald69652032004-09-07 20:24:22 +00005895 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5896}
5897
5898PyObject *
5899PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 Py_ssize_t size,
5901 const char *errors,
5902 int *byteorder,
5903 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005904{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005906 Py_ssize_t startinpos;
5907 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005908 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005909 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005910 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005911 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005912 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 PyObject *errorHandler = NULL;
5914 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005915 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
Andy Lestere6be9b52020-02-11 20:28:35 -06005917 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005918 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919
5920 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005921 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005923 /* Check for BOM marks (U+FEFF) in the input and adjust current
5924 byte order setting accordingly. In native mode, the leading BOM
5925 mark is skipped, in all other modes, it is copied to the output
5926 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005927 if (bo == 0 && size >= 2) {
5928 const Py_UCS4 bom = (q[1] << 8) | q[0];
5929 if (bom == 0xFEFF) {
5930 q += 2;
5931 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005933 else if (bom == 0xFFFE) {
5934 q += 2;
5935 bo = 1;
5936 }
5937 if (byteorder)
5938 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
Antoine Pitrou63065d72012-05-15 23:48:04 +02005941 if (q == e) {
5942 if (consumed)
5943 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005944 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005945 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005946
Christian Heimes743e0cd2012-10-17 23:52:17 +02005947#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005948 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005949 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005950#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005951 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005952 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005953#endif
Tim Peters772747b2001-08-09 22:21:55 +00005954
Antoine Pitrou63065d72012-05-15 23:48:04 +02005955 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005956 character count normally. Error handler will take care of
5957 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005958 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005959 writer.min_length = (e - q + 1) / 2;
5960 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005961 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005962
Antoine Pitrou63065d72012-05-15 23:48:04 +02005963 while (1) {
5964 Py_UCS4 ch = 0;
5965 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005966 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005967 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005968 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005969 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005970 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005971 native_ordering);
5972 else
5973 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005974 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005975 native_ordering);
5976 } else if (kind == PyUnicode_2BYTE_KIND) {
5977 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005978 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005979 native_ordering);
5980 } else {
5981 assert(kind == PyUnicode_4BYTE_KIND);
5982 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005983 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005984 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005985 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005986 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987
Antoine Pitrou63065d72012-05-15 23:48:04 +02005988 switch (ch)
5989 {
5990 case 0:
5991 /* remaining byte at the end? (size should be even) */
5992 if (q == e || consumed)
5993 goto End;
5994 errmsg = "truncated data";
5995 startinpos = ((const char *)q) - starts;
5996 endinpos = ((const char *)e) - starts;
5997 break;
5998 /* The remaining input chars are ignored if the callback
5999 chooses to skip the input */
6000 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006001 q -= 2;
6002 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006003 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006004 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006005 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006006 endinpos = ((const char *)e) - starts;
6007 break;
6008 case 2:
6009 errmsg = "illegal encoding";
6010 startinpos = ((const char *)q) - 2 - starts;
6011 endinpos = startinpos + 2;
6012 break;
6013 case 3:
6014 errmsg = "illegal UTF-16 surrogate";
6015 startinpos = ((const char *)q) - 4 - starts;
6016 endinpos = startinpos + 2;
6017 break;
6018 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006019 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006020 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 continue;
6022 }
6023
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006024 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006025 errors,
6026 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006027 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006028 &starts,
6029 (const char **)&e,
6030 &startinpos,
6031 &endinpos,
6032 &exc,
6033 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006034 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 }
6037
Antoine Pitrou63065d72012-05-15 23:48:04 +02006038End:
Walter Dörwald69652032004-09-07 20:24:22 +00006039 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 Py_XDECREF(errorHandler);
6043 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006044 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006047 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048 Py_XDECREF(errorHandler);
6049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 return NULL;
6051}
6052
Tim Peters772747b2001-08-09 22:21:55 +00006053PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006054_PyUnicode_EncodeUTF16(PyObject *str,
6055 const char *errors,
6056 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006058 enum PyUnicode_Kind kind;
6059 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006060 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006061 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006062 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006063 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006064#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006065 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006066#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006067 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006068#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006069 const char *encoding;
6070 Py_ssize_t nsize, pos;
6071 PyObject *errorHandler = NULL;
6072 PyObject *exc = NULL;
6073 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006074
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006075 if (!PyUnicode_Check(str)) {
6076 PyErr_BadArgument();
6077 return NULL;
6078 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006079 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006080 return NULL;
6081 kind = PyUnicode_KIND(str);
6082 data = PyUnicode_DATA(str);
6083 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006084
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006085 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006086 if (kind == PyUnicode_4BYTE_KIND) {
6087 const Py_UCS4 *in = (const Py_UCS4 *)data;
6088 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006089 while (in < end) {
6090 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006091 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006092 }
6093 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006094 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006095 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006097 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006098 nsize = len + pairs + (byteorder == 0);
6099 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006100 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006104 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006105 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006106 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006107 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006108 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006109 }
6110 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006111 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006112 }
Tim Peters772747b2001-08-09 22:21:55 +00006113
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006114 if (kind == PyUnicode_1BYTE_KIND) {
6115 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6116 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006117 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006118
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006119 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006120 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006121 }
6122 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006123 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006124 }
6125 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006126 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006127 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006128
6129 pos = 0;
6130 while (pos < len) {
6131 Py_ssize_t repsize, moreunits;
6132
6133 if (kind == PyUnicode_2BYTE_KIND) {
6134 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6135 &out, native_ordering);
6136 }
6137 else {
6138 assert(kind == PyUnicode_4BYTE_KIND);
6139 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6140 &out, native_ordering);
6141 }
6142 if (pos == len)
6143 break;
6144
6145 rep = unicode_encode_call_errorhandler(
6146 errors, &errorHandler,
6147 encoding, "surrogates not allowed",
6148 str, &exc, pos, pos + 1, &pos);
6149 if (!rep)
6150 goto error;
6151
6152 if (PyBytes_Check(rep)) {
6153 repsize = PyBytes_GET_SIZE(rep);
6154 if (repsize & 1) {
6155 raise_encode_exception(&exc, encoding,
6156 str, pos - 1, pos,
6157 "surrogates not allowed");
6158 goto error;
6159 }
6160 moreunits = repsize / 2;
6161 }
6162 else {
6163 assert(PyUnicode_Check(rep));
6164 if (PyUnicode_READY(rep) < 0)
6165 goto error;
6166 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6167 if (!PyUnicode_IS_ASCII(rep)) {
6168 raise_encode_exception(&exc, encoding,
6169 str, pos - 1, pos,
6170 "surrogates not allowed");
6171 goto error;
6172 }
6173 }
6174
6175 /* two bytes are reserved for each surrogate */
6176 if (moreunits > 1) {
6177 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006178 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006179 /* integer overflow */
6180 PyErr_NoMemory();
6181 goto error;
6182 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006183 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006184 goto error;
6185 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6186 }
6187
6188 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006189 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006190 out += moreunits;
6191 } else /* rep is unicode */ {
6192 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6193 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6194 &out, native_ordering);
6195 }
6196
6197 Py_CLEAR(rep);
6198 }
6199
6200 /* Cut back to size actually needed. This is necessary for, for example,
6201 encoding of a string containing isolated surrogates and the 'ignore' handler
6202 is used. */
6203 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6204 if (nsize != PyBytes_GET_SIZE(v))
6205 _PyBytes_Resize(&v, nsize);
6206 Py_XDECREF(errorHandler);
6207 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006208 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006209 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006210 error:
6211 Py_XDECREF(rep);
6212 Py_XDECREF(errorHandler);
6213 Py_XDECREF(exc);
6214 Py_XDECREF(v);
6215 return NULL;
6216#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217}
6218
Alexander Belopolsky40018472011-02-26 01:02:56 +00006219PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006220PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6221 Py_ssize_t size,
6222 const char *errors,
6223 int byteorder)
6224{
6225 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006226 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006227 if (tmp == NULL)
6228 return NULL;
6229 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6230 Py_DECREF(tmp);
6231 return result;
6232}
6233
6234PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006235PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006237 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238}
6239
6240/* --- Unicode Escape Codec ----------------------------------------------- */
6241
/* Cached pointer to the name-lookup C API imported from the capsule named
   PyUnicodeData_CAPSULE_NAME.  It stays NULL until the unicode-escape
   decoder meets its first \N{...} escape and imports the capsule. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00006242static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006243
/* Decode the SIZE bytes at S, written in backslash-escape syntax, into a
   str object built with a _PyUnicodeWriter.

   Bytes other than a backslash are written as the code point equal to the
   byte value.  Recognised escapes are: backslash-newline (produces no
   output), \\ \' \" \b \f \t \n \r \v \a, octal escapes of one to three
   digits, \xXX, \uXXXX, \UXXXXXXXX and \N{name}.

   An unrecognised escape is copied to the output verbatim (backslash plus
   the following character); *FIRST_INVALID_ESCAPE is set to point at the
   character following the backslash of the first such escape, and is NULL
   when there was none.

   Malformed escapes are reported through
   unicode_decode_call_errorhandler_writer() with the codec name
   "unicodeescape" and the handler selected by ERRORS.

   Returns the result of _PyUnicodeWriter_Finish() on success, the empty
   string for SIZE == 0, or NULL after releasing the writer on failure. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006244PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006245_PyUnicode_DecodeUnicodeEscape(const char *s,
6246 Py_ssize_t size,
6247 const char *errors,
6248 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006250 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006251 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006253 PyObject *errorHandler = NULL;
6254 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006255
Eric V. Smith42454af2016-10-31 09:22:08 -04006256 // so we can remember if we've seen an invalid escape char or not
6257 *first_invalid_escape = NULL;
6258
Victor Stinner62ec3312016-09-06 17:04:34 -07006259 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006260 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 }
6262 /* Escaped strings will always be longer than the resulting
6263 Unicode string, so we start with size here and then reduce the
6264 length after conversion to the true value.
6265 (but if the error callback returns a long replacement string
6266 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006267 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006268 writer.min_length = size;
/* NOTE(review): the literal 127 is presumably the initial maximum code
   point (ASCII-only buffer) -- confirm against _PyUnicodeWriter_Prepare. */
6269 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6270 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006271 }
6272
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 end = s + size;
6274 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006275 unsigned char c = (unsigned char) *s++;
6276 Py_UCS4 ch;
6277 int count;
6278 Py_ssize_t startinpos;
6279 Py_ssize_t endinpos;
6280 const char *message;
6281
/* WRITE_ASCII_CHAR stores an ASCII character straight into the writer
   buffer.  Capacity is only checked by an assert, so it relies on the
   space reserved before the loop. */
6282#define WRITE_ASCII_CHAR(ch) \
6283 do { \
6284 assert(ch <= 127); \
6285 assert(writer.pos < writer.size); \
6286 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6287 } while(0)
6288
/* WRITE_CHAR stores CH directly when it does not exceed writer.maxchar;
   otherwise it goes through _PyUnicodeWriter_WriteCharInline() and jumps
   to onError if that call fails. */
6289#define WRITE_CHAR(ch) \
6290 do { \
6291 if (ch <= writer.maxchar) { \
6292 assert(writer.pos < writer.size); \
6293 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6294 } \
6295 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6296 goto onError; \
6297 } \
6298 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299
6300 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 if (c != '\\') {
6302 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 continue;
6304 }
6305
/* s already points past the backslash, hence the -1 to get its offset. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006306 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006308 if (s >= end) {
6309 message = "\\ at end of string";
6310 goto error;
6311 }
6312 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006313
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006315 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 /* \x escapes */
/* Backslash followed by a newline writes nothing at all. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006318 case '\n': continue;
6319 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6320 case '\'': WRITE_ASCII_CHAR('\''); continue;
6321 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6322 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006323 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006324 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6325 case 't': WRITE_ASCII_CHAR('\t'); continue;
6326 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6327 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006328 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006329 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006330 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 /* \OOO (octal) escapes */
/* The first octal digit is already in c; up to two more are consumed
   if they are present. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334 case '0': case '1': case '2': case '3':
6335 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006337 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006338 ch = (ch<<3) + *s++ - '0';
6339 if (s < end && '0' <= *s && *s <= '7') {
6340 ch = (ch<<3) + *s++ - '0';
6341 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006343 WRITE_CHAR(ch);
6344 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 /* hex escapes */
6347 /* \xXX */
/* Each hex case sets the number of digits required and the message to
   report if they are missing, then shares the parsing code at the
   hexescape label. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006349 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006350 message = "truncated \\xXX escape";
6351 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006356 message = "truncated \\uXXXX escape";
6357 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006360 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006361 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006362 message = "truncated \\UXXXXXXXX escape";
6363 hexescape:
/* Accumulate hex digits into ch.  The loop stops early at the end of the
   input or at the first non-hex byte, which leaves count non-zero. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006364 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006365 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006366 ch <<= 4;
6367 if (c >= '0' && c <= '9') {
6368 ch += c - '0';
6369 }
6370 else if (c >= 'a' && c <= 'f') {
6371 ch += c - ('a' - 10);
6372 }
6373 else if (c >= 'A' && c <= 'F') {
6374 ch += c - ('A' - 10);
6375 }
6376 else {
6377 break;
6378 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006379 }
/* Fewer digits than required: report the "truncated" message set above. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006380 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006381 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006382 }
6383
6384 /* when we get here, ch is a 32-bit unicode character */
6385 if (ch > MAX_UNICODE) {
6386 message = "illegal Unicode character";
6387 goto error;
6388 }
6389
6390 WRITE_CHAR(ch);
6391 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006392
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006394 case 'N':
/* Import the name-lookup capsule on first use.  If the import fails,
   a UnicodeError is set here and decoding is abandoned without
   consulting the error handler. */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006395 if (ucnhash_CAPI == NULL) {
6396 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006397 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6398 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 if (ucnhash_CAPI == NULL) {
6400 PyErr_SetString(
6401 PyExc_UnicodeError,
6402 "\\N escapes not supported (can't load unicodedata module)"
6403 );
6404 goto onError;
6405 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006406 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006407
6408 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006409 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 const char *start = ++s;
6411 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006412 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006414 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006415 namelen = s - start;
/* Require a non-empty name and a closing brace that was actually found. */
6416 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006417 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006418 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 ch = 0xffffffff; /* in case 'getcode' messes up */
/* getcode() takes the length as an int, so longer names are rejected
   before the call. */
6420 if (namelen <= INT_MAX &&
6421 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6422 &ch, 0)) {
6423 assert(ch <= MAX_UNICODE);
6424 WRITE_CHAR(ch);
6425 continue;
6426 }
6427 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006428 }
6429 }
/* Reached when the opening brace is missing, the name is empty or
   unterminated, or the lookup did not succeed. */
Serhiy Storchakad6793772013-01-29 10:20:44 +02006430 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006431
6432 default:
/* Unknown escape: remember the first one for the caller, then copy the
   backslash and the character through unchanged. */
Eric V. Smith42454af2016-10-31 09:22:08 -04006433 if (*first_invalid_escape == NULL) {
6434 *first_invalid_escape = s-1; /* Back up one char, since we've
6435 already incremented s. */
6436 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 WRITE_ASCII_CHAR('\\');
6438 WRITE_CHAR(c);
6439 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006441
/* Every successful case above ends in "continue", so this point is
   reached only through "goto error".  endinpos is the offset just past
   the bytes consumed for the bad escape; min_length is the remaining
   input plus what has been written so far.
   NOTE(review): starts, end and s are passed by address, so the handler
   presumably may replace the input buffer -- confirm in
   unicode_decode_call_errorhandler_writer(). */
6442 error:
6443 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006444 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006445 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006446 errors, &errorHandler,
6447 "unicodeescape", message,
6448 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006449 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006450 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006452 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006453
6454#undef WRITE_ASCII_CHAR
6455#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006457
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006458 Py_XDECREF(errorHandler);
6459 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006460 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006461
/* Failure exit: release the partially built result and the error-handling
   state before returning NULL. */
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006463 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 Py_XDECREF(errorHandler);
6465 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 return NULL;
6467}
6468
Eric V. Smith42454af2016-10-31 09:22:08 -04006469PyObject *
6470PyUnicode_DecodeUnicodeEscape(const char *s,
6471 Py_ssize_t size,
6472 const char *errors)
6473{
6474 const char *first_invalid_escape;
6475 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6476 &first_invalid_escape);
6477 if (result == NULL)
6478 return NULL;
6479 if (first_invalid_escape != NULL) {
6480 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6481 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006482 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006483 Py_DECREF(result);
6484 return NULL;
6485 }
6486 }
6487 return result;
6488}
6489
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006490/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491
Alexander Belopolsky40018472011-02-26 01:02:56 +00006492PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006495 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006496 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006498 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006499 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006500 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501
Ezio Melottie7f90372012-10-05 03:33:31 +03006502 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006503 escape.
6504
Ezio Melottie7f90372012-10-05 03:33:31 +03006505 For UCS1 strings it's '\xxx', 4 bytes per source character.
6506 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6507 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006508 */
6509
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006510 if (!PyUnicode_Check(unicode)) {
6511 PyErr_BadArgument();
6512 return NULL;
6513 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006514 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006515 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006516 }
Victor Stinner358af132015-10-12 22:36:57 +02006517
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006518 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 if (len == 0) {
6520 return PyBytes_FromStringAndSize(NULL, 0);
6521 }
6522
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006523 kind = PyUnicode_KIND(unicode);
6524 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006525 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6526 bytes, and 1 byte characters 4. */
6527 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006528 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006529 return PyErr_NoMemory();
6530 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006531 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006532 if (repr == NULL) {
6533 return NULL;
6534 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006535
Victor Stinner62ec3312016-09-06 17:04:34 -07006536 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006537 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006538 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006539
Victor Stinner62ec3312016-09-06 17:04:34 -07006540 /* U+0000-U+00ff range */
6541 if (ch < 0x100) {
6542 if (ch >= ' ' && ch < 127) {
6543 if (ch != '\\') {
6544 /* Copy printable US ASCII as-is */
6545 *p++ = (char) ch;
6546 }
6547 /* Escape backslashes */
6548 else {
6549 *p++ = '\\';
6550 *p++ = '\\';
6551 }
6552 }
Victor Stinner358af132015-10-12 22:36:57 +02006553
Victor Stinner62ec3312016-09-06 17:04:34 -07006554 /* Map special whitespace to '\t', \n', '\r' */
6555 else if (ch == '\t') {
6556 *p++ = '\\';
6557 *p++ = 't';
6558 }
6559 else if (ch == '\n') {
6560 *p++ = '\\';
6561 *p++ = 'n';
6562 }
6563 else if (ch == '\r') {
6564 *p++ = '\\';
6565 *p++ = 'r';
6566 }
6567
6568 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6569 else {
6570 *p++ = '\\';
6571 *p++ = 'x';
6572 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6573 *p++ = Py_hexdigits[ch & 0x000F];
6574 }
Tim Petersced69f82003-09-16 20:30:58 +00006575 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006576 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006577 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 *p++ = '\\';
6579 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006580 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6581 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6582 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6583 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6586 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006587
Victor Stinner62ec3312016-09-06 17:04:34 -07006588 /* Make sure that the first two digits are zero */
6589 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006590 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006591 *p++ = 'U';
6592 *p++ = '0';
6593 *p++ = '0';
6594 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6595 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6596 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6597 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6598 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6599 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602
Victor Stinner62ec3312016-09-06 17:04:34 -07006603 assert(p - PyBytes_AS_STRING(repr) > 0);
6604 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6605 return NULL;
6606 }
6607 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608}
6609
Alexander Belopolsky40018472011-02-26 01:02:56 +00006610PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006611PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6612 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006614 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006615 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006616 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006618 }
6619
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006620 result = PyUnicode_AsUnicodeEscapeString(tmp);
6621 Py_DECREF(tmp);
6622 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623}
6624
6625/* --- Raw Unicode Escape Codec ------------------------------------------- */
6626
/* Decode the SIZE bytes at S using the raw-unicode-escape rules and build
   the result with a _PyUnicodeWriter.

   Only \uXXXX and \UXXXXXXXX are interpreted.  Every other byte is written
   as the code point equal to its value; this includes a backslash that is
   the last byte of the input, and a backslash followed by any character
   other than 'u' or 'U' (both characters are copied through).

   Truncated or out-of-range escapes are reported through
   unicode_decode_call_errorhandler_writer() with the codec name
   "rawunicodeescape" and the handler selected by ERRORS.

   Returns the result of _PyUnicodeWriter_Finish() on success, the empty
   string for SIZE == 0, or NULL after releasing the writer on failure. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006627PyObject *
6628PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006629 Py_ssize_t size,
6630 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006633 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 PyObject *errorHandler = NULL;
6636 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006637
Victor Stinner62ec3312016-09-06 17:04:34 -07006638 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006639 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006640 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006641
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 /* Escaped strings will always be longer than the resulting
6643 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 length after conversion to the true value. (But decoding error
6645 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006646 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006647 writer.min_length = size;
/* NOTE(review): the literal 127 is presumably the initial maximum code
   point (ASCII-only buffer) -- confirm against _PyUnicodeWriter_Prepare. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006648 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6649 goto onError;
6650 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006651
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 end = s + size;
6653 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006654 unsigned char c = (unsigned char) *s++;
6655 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006656 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006657 Py_ssize_t startinpos;
6658 Py_ssize_t endinpos;
6659 const char *message;
6660
/* WRITE_CHAR stores CH directly when it does not exceed writer.maxchar;
   otherwise it goes through _PyUnicodeWriter_WriteCharInline() and jumps
   to onError if that call fails. */
6661#define WRITE_CHAR(ch) \
6662 do { \
6663 if (ch <= writer.maxchar) { \
6664 assert(writer.pos < writer.size); \
6665 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6666 } \
6667 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6668 goto onError; \
6669 } \
6670 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 /* Non-escape characters are interpreted as Unicode ordinals */
/* A backslash that is the very last byte is also written literally. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006673 if (c != '\\' || s >= end) {
6674 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006676 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006677
/* Look at the character after the backslash.  Only 'u' and 'U' start an
   escape; anything else is copied through together with the backslash. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006678 c = (unsigned char) *s++;
6679 if (c == 'u') {
6680 count = 4;
6681 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006683 else if (c == 'U') {
6684 count = 8;
6685 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006686 }
6687 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006688 assert(writer.pos < writer.size);
6689 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6690 WRITE_CHAR(c);
6691 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006692 }
/* s now points past both the backslash and the 'u'/'U', hence the -2. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006693 startinpos = s - starts - 2;
6694
6695 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
/* The loop stops early at the end of the input or at the first non-hex
   byte, which leaves count non-zero. */
6696 for (ch = 0; count && s < end; ++s, --count) {
6697 c = (unsigned char)*s;
6698 ch <<= 4;
6699 if (c >= '0' && c <= '9') {
6700 ch += c - '0';
6701 }
6702 else if (c >= 'a' && c <= 'f') {
6703 ch += c - ('a' - 10);
6704 }
6705 else if (c >= 'A' && c <= 'F') {
6706 ch += c - ('A' - 10);
6707 }
6708 else {
6709 break;
6710 }
6711 }
/* All digits were read: write the code point if it is in range, otherwise
   switch to the out-of-range message.  When count is still non-zero the
   "truncated" message chosen above is kept. */
6712 if (!count) {
6713 if (ch <= MAX_UNICODE) {
6714 WRITE_CHAR(ch);
6715 continue;
6716 }
6717 message = "\\Uxxxxxxxx out of range";
6718 }
6719
/* Error path: endinpos is the offset just past the bytes consumed for the
   bad escape; min_length is the remaining input plus what has been
   written so far.
   NOTE(review): starts, end and s are passed by address, so the handler
   presumably may replace the input buffer -- confirm in
   unicode_decode_call_errorhandler_writer(). */
6720 endinpos = s-starts;
6721 writer.min_length = end - s + writer.pos;
6722 if (unicode_decode_call_errorhandler_writer(
6723 errors, &errorHandler,
6724 "rawunicodeescape", message,
6725 &starts, &end, &startinpos, &endinpos, &exc, &s,
6726 &writer)) {
6727 goto onError;
6728 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006729 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006730
6731#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733 Py_XDECREF(errorHandler);
6734 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006735 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006736
/* Failure exit: release the partially built result and the error-handling
   state before returning NULL. */
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006738 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 Py_XDECREF(errorHandler);
6740 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743}
6744
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006745
Alexander Belopolsky40018472011-02-26 01:02:56 +00006746PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006747PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748{
Victor Stinner62ec3312016-09-06 17:04:34 -07006749 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006751 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006752 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006753 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006754 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006756 if (!PyUnicode_Check(unicode)) {
6757 PyErr_BadArgument();
6758 return NULL;
6759 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006760 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006761 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006762 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006763 kind = PyUnicode_KIND(unicode);
6764 data = PyUnicode_DATA(unicode);
6765 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006766 if (kind == PyUnicode_1BYTE_KIND) {
6767 return PyBytes_FromStringAndSize(data, len);
6768 }
Victor Stinner0e368262011-11-10 20:12:49 +01006769
Victor Stinner62ec3312016-09-06 17:04:34 -07006770 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6771 bytes, and 1 byte characters 4. */
6772 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006773
Victor Stinner62ec3312016-09-06 17:04:34 -07006774 if (len > PY_SSIZE_T_MAX / expandsize) {
6775 return PyErr_NoMemory();
6776 }
6777 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6778 if (repr == NULL) {
6779 return NULL;
6780 }
6781 if (len == 0) {
6782 return repr;
6783 }
6784
6785 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006786 for (pos = 0; pos < len; pos++) {
6787 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006788
Victor Stinner62ec3312016-09-06 17:04:34 -07006789 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6790 if (ch < 0x100) {
6791 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006792 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006793 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006794 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 *p++ = '\\';
6796 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006797 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6798 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6799 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6800 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006802 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6803 else {
6804 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6805 *p++ = '\\';
6806 *p++ = 'U';
6807 *p++ = '0';
6808 *p++ = '0';
6809 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6810 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6811 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6812 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6813 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6814 *p++ = Py_hexdigits[ch & 15];
6815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006817
Victor Stinner62ec3312016-09-06 17:04:34 -07006818 assert(p > PyBytes_AS_STRING(repr));
6819 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6820 return NULL;
6821 }
6822 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823}
6824
Alexander Belopolsky40018472011-02-26 01:02:56 +00006825PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006826PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6827 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006829 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006830 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006831 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006832 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006833 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6834 Py_DECREF(tmp);
6835 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836}
6837
6838/* --- Latin-1 Codec ------------------------------------------------------ */
6839
Alexander Belopolsky40018472011-02-26 01:02:56 +00006840PyObject *
6841PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006842 Py_ssize_t size,
6843 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006846 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847}
6848
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006850static void
6851make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006852 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006853 PyObject *unicode,
6854 Py_ssize_t startpos, Py_ssize_t endpos,
6855 const char *reason)
6856{
6857 if (*exceptionObject == NULL) {
6858 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006859 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006860 encoding, unicode, startpos, endpos, reason);
6861 }
6862 else {
6863 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6864 goto onError;
6865 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6866 goto onError;
6867 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6868 goto onError;
6869 return;
6870 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006871 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006872 }
6873}
6874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006875/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006876static void
6877raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006878 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006879 PyObject *unicode,
6880 Py_ssize_t startpos, Py_ssize_t endpos,
6881 const char *reason)
6882{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006883 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006884 encoding, unicode, startpos, endpos, reason);
6885 if (*exceptionObject != NULL)
6886 PyCodec_StrictErrors(*exceptionObject);
6887}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006888
6889/* error handling callback helper:
6890 build arguments, call the callback and check the arguments,
6891 put the result into newpos and return the replacement string, which
6892 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006893static PyObject *
6894unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006895 PyObject **errorHandler,
6896 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006897 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006898 Py_ssize_t startpos, Py_ssize_t endpos,
6899 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006900{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006901 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006902 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903 PyObject *restuple;
6904 PyObject *resunicode;
6905
6906 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006910 }
6911
Benjamin Petersonbac79492012-01-14 13:34:47 -05006912 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006913 return NULL;
6914 len = PyUnicode_GET_LENGTH(unicode);
6915
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006916 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006917 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006918 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920
Petr Viktorinffd97532020-02-11 17:46:57 +01006921 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006924 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006925 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 Py_DECREF(restuple);
6927 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006929 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 &resunicode, newpos)) {
6931 Py_DECREF(restuple);
6932 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006933 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006934 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6935 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6936 Py_DECREF(restuple);
6937 return NULL;
6938 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006940 *newpos = len + *newpos;
6941 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006942 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 Py_DECREF(restuple);
6944 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 Py_INCREF(resunicode);
6947 Py_DECREF(restuple);
6948 return resunicode;
6949}
6950
Alexander Belopolsky40018472011-02-26 01:02:56 +00006951static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006952unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006953 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006954 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006955{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006956 /* input state */
6957 Py_ssize_t pos=0, size;
6958 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006959 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006960 /* pointer into the output */
6961 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006962 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6963 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006964 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006965 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006966 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006967 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006968 /* output object */
6969 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006970
Benjamin Petersonbac79492012-01-14 13:34:47 -05006971 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006972 return NULL;
6973 size = PyUnicode_GET_LENGTH(unicode);
6974 kind = PyUnicode_KIND(unicode);
6975 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006976 /* allocate enough for a simple encoding without
6977 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006978 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006979 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006980
6981 _PyBytesWriter_Init(&writer);
6982 str = _PyBytesWriter_Alloc(&writer, size);
6983 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006984 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006985
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006986 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006987 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006990 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006992 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006993 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006994 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006996 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006998 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006999 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007001
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007002 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007004
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007005 /* Only overallocate the buffer if it's not the last write */
7006 writer.overallocate = (collend < size);
7007
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007009 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007010 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007011
7012 switch (error_handler) {
7013 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007014 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007016
7017 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007018 memset(str, '?', collend - collstart);
7019 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007020 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007021 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007022 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 break;
Victor Stinner50149202015-09-22 00:26:54 +02007024
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007025 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007026 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007027 writer.min_size -= (collend - collstart);
7028 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007029 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007030 if (str == NULL)
7031 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007032 pos = collend;
7033 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007034
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007035 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007036 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007037 writer.min_size -= (collend - collstart);
7038 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007039 unicode, collstart, collend);
7040 if (str == NULL)
7041 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007042 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 break;
Victor Stinner50149202015-09-22 00:26:54 +02007044
Victor Stinnerc3713e92015-09-29 12:32:13 +02007045 case _Py_ERROR_SURROGATEESCAPE:
7046 for (i = collstart; i < collend; ++i) {
7047 ch = PyUnicode_READ(kind, data, i);
7048 if (ch < 0xdc80 || 0xdcff < ch) {
7049 /* Not a UTF-8b surrogate */
7050 break;
7051 }
7052 *str++ = (char)(ch - 0xdc00);
7053 ++pos;
7054 }
7055 if (i >= collend)
7056 break;
7057 collstart = pos;
7058 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007059 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007060
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007062 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7063 encoding, reason, unicode, &exc,
7064 collstart, collend, &newpos);
7065 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007067
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007068 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007069 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007070
Victor Stinner6bd525b2015-10-09 13:10:05 +02007071 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007072 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007073 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007074 PyBytes_AS_STRING(rep),
7075 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007076 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007077 else {
7078 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007079
Victor Stinner6bd525b2015-10-09 13:10:05 +02007080 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007082
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007083 if (limit == 256 ?
7084 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7085 !PyUnicode_IS_ASCII(rep))
7086 {
7087 /* Not all characters are smaller than limit */
7088 raise_encode_exception(&exc, encoding, unicode,
7089 collstart, collend, reason);
7090 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007092 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7093 str = _PyBytesWriter_WriteBytes(&writer, str,
7094 PyUnicode_DATA(rep),
7095 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007097 if (str == NULL)
7098 goto onError;
7099
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007100 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007101 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007102 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007103
7104 /* If overallocation was disabled, ensure that it was the last
7105 write. Otherwise, we missed an optimization */
7106 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007107 }
7108 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007109
Victor Stinner50149202015-09-22 00:26:54 +02007110 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007111 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007112 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007113
7114 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007115 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007116 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007117 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007118 Py_XDECREF(exc);
7119 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007120}
7121
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007122/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007123PyObject *
7124PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007125 Py_ssize_t size,
7126 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007128 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007129 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007130 if (unicode == NULL)
7131 return NULL;
7132 result = unicode_encode_ucs1(unicode, errors, 256);
7133 Py_DECREF(unicode);
7134 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135}
7136
Alexander Belopolsky40018472011-02-26 01:02:56 +00007137PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007138_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139{
7140 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 PyErr_BadArgument();
7142 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007144 if (PyUnicode_READY(unicode) == -1)
7145 return NULL;
7146 /* Fast path: if it is a one-byte string, construct
7147 bytes object directly. */
7148 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7149 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7150 PyUnicode_GET_LENGTH(unicode));
7151 /* Non-Latin-1 characters present. Defer to above function to
7152 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007153 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007154}
7155
7156PyObject*
7157PyUnicode_AsLatin1String(PyObject *unicode)
7158{
7159 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160}
7161
7162/* --- 7-bit ASCII Codec -------------------------------------------------- */
7163
Alexander Belopolsky40018472011-02-26 01:02:56 +00007164PyObject *
7165PyUnicode_DecodeASCII(const char *s,
7166 Py_ssize_t size,
7167 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007169 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007170 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007171 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007172 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007173 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007174
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007176 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007177
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007179 if (size == 1 && (unsigned char)s[0] < 128)
7180 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007181
Inada Naoki770847a2019-06-24 12:30:24 +09007182 // Shortcut for simple case
7183 PyObject *u = PyUnicode_New(size, 127);
7184 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007185 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007186 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007187 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007188 if (outpos == size) {
7189 return u;
7190 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007191
Inada Naoki770847a2019-06-24 12:30:24 +09007192 _PyUnicodeWriter writer;
7193 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007194 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007195
Inada Naoki770847a2019-06-24 12:30:24 +09007196 s += outpos;
7197 int kind = writer.kind;
7198 void *data = writer.data;
7199 Py_ssize_t startinpos, endinpos;
7200
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007201 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007202 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007204 PyUnicode_WRITE(kind, data, writer.pos, c);
7205 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007207 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007209
7210 /* byte outsize range 0x00..0x7f: call the error handler */
7211
7212 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007213 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007214
7215 switch (error_handler)
7216 {
7217 case _Py_ERROR_REPLACE:
7218 case _Py_ERROR_SURROGATEESCAPE:
7219 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007220 but we may switch to UCS2 at the first write */
7221 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7222 goto onError;
7223 kind = writer.kind;
7224 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007225
7226 if (error_handler == _Py_ERROR_REPLACE)
7227 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7228 else
7229 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7230 writer.pos++;
7231 ++s;
7232 break;
7233
7234 case _Py_ERROR_IGNORE:
7235 ++s;
7236 break;
7237
7238 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 startinpos = s-starts;
7240 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007241 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007242 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 "ascii", "ordinal not in range(128)",
7244 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007245 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007247 kind = writer.kind;
7248 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007251 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007253 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007254
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007256 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007257 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007258 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 return NULL;
7260}
7261
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007262/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007263PyObject *
7264PyUnicode_EncodeASCII(const Py_UNICODE *p,
7265 Py_ssize_t size,
7266 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007268 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007269 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007270 if (unicode == NULL)
7271 return NULL;
7272 result = unicode_encode_ucs1(unicode, errors, 128);
7273 Py_DECREF(unicode);
7274 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275}
7276
Alexander Belopolsky40018472011-02-26 01:02:56 +00007277PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007278_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279{
7280 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007281 PyErr_BadArgument();
7282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007284 if (PyUnicode_READY(unicode) == -1)
7285 return NULL;
7286 /* Fast path: if it is an ASCII-only string, construct bytes object
7287 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007288 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007289 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7290 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007291 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007292}
7293
7294PyObject *
7295PyUnicode_AsASCIIString(PyObject *unicode)
7296{
7297 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298}
7299
Steve Dowercc16be82016-09-08 10:35:16 -07007300#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007301
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007302/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007303
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007304#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305#define NEED_RETRY
7306#endif
7307
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7312#define DECODING_CHUNK_SIZE (INT_MAX/4)
7313
Victor Stinner3a50e702011-10-18 21:21:00 +02007314#ifndef WC_ERR_INVALID_CHARS
7315# define WC_ERR_INVALID_CHARS 0x0080
7316#endif
7317
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007318static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007319code_page_name(UINT code_page, PyObject **obj)
7320{
7321 *obj = NULL;
7322 if (code_page == CP_ACP)
7323 return "mbcs";
7324 if (code_page == CP_UTF7)
7325 return "CP_UTF7";
7326 if (code_page == CP_UTF8)
7327 return "CP_UTF8";
7328
7329 *obj = PyBytes_FromFormat("cp%u", code_page);
7330 if (*obj == NULL)
7331 return NULL;
7332 return PyBytes_AS_STRING(*obj);
7333}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007334
Victor Stinner3a50e702011-10-18 21:21:00 +02007335static DWORD
7336decode_code_page_flags(UINT code_page)
7337{
7338 if (code_page == CP_UTF7) {
7339 /* The CP_UTF7 decoder only supports flags=0 */
7340 return 0;
7341 }
7342 else
7343 return MB_ERR_INVALID_CHARS;
7344}
7345
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007346/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 * Decode a byte string from a Windows code page into unicode object in strict
7348 * mode.
7349 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007350 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7351 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007352 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007353static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007354decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007355 wchar_t **buf,
7356 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 const char *in,
7358 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007360 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007361 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007362 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363
7364 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007366 while ((outsize = MultiByteToWideChar(code_page, flags,
7367 in, insize, NULL, 0)) <= 0)
7368 {
7369 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7370 goto error;
7371 }
7372 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7373 flags = 0;
7374 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007376 /* Extend a wchar_t* buffer */
7377 Py_ssize_t n = *bufsize; /* Get the current length */
7378 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7379 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007380 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007381 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007382
7383 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7385 if (outsize <= 0)
7386 goto error;
7387 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007388
Victor Stinner3a50e702011-10-18 21:21:00 +02007389error:
7390 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7391 return -2;
7392 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007393 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007394}
7395
Victor Stinner3a50e702011-10-18 21:21:00 +02007396/*
7397 * Decode a byte string from a code page into unicode object with an error
7398 * handler.
7399 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007400 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007401 * UnicodeDecodeError exception and returns -1 on error.
7402 */
7403static int
7404decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007405 wchar_t **buf,
7406 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007407 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007408 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007409{
7410 const char *startin = in;
7411 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007412 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 /* Ideally, we should get reason from FormatMessage. This is the Windows
7414 2000 English version of the message. */
7415 const char *reason = "No mapping for the Unicode character exists "
7416 "in the target code page.";
7417 /* each step cannot decode more than 1 character, but a character can be
7418 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007419 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007420 int insize;
7421 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 PyObject *errorHandler = NULL;
7423 PyObject *exc = NULL;
7424 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007425 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 DWORD err;
7427 int ret = -1;
7428
7429 assert(size > 0);
7430
7431 encoding = code_page_name(code_page, &encoding_obj);
7432 if (encoding == NULL)
7433 return -1;
7434
Victor Stinner7d00cc12014-03-17 23:08:06 +01007435 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7437 UnicodeDecodeError. */
7438 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7439 if (exc != NULL) {
7440 PyCodec_StrictErrors(exc);
7441 Py_CLEAR(exc);
7442 }
7443 goto error;
7444 }
7445
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007446 /* Extend a wchar_t* buffer */
7447 Py_ssize_t n = *bufsize; /* Get the current length */
7448 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7449 PyErr_NoMemory();
7450 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007452 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7453 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007455 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007456
7457 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 while (in < endin)
7459 {
7460 /* Decode a character */
7461 insize = 1;
7462 do
7463 {
7464 outsize = MultiByteToWideChar(code_page, flags,
7465 in, insize,
7466 buffer, Py_ARRAY_LENGTH(buffer));
7467 if (outsize > 0)
7468 break;
7469 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007470 if (err == ERROR_INVALID_FLAGS && flags) {
7471 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7472 flags = 0;
7473 continue;
7474 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 if (err != ERROR_NO_UNICODE_TRANSLATION
7476 && err != ERROR_INSUFFICIENT_BUFFER)
7477 {
7478 PyErr_SetFromWindowsErr(0);
7479 goto error;
7480 }
7481 insize++;
7482 }
7483 /* 4=maximum length of a UTF-8 sequence */
7484 while (insize <= 4 && (in + insize) <= endin);
7485
7486 if (outsize <= 0) {
7487 Py_ssize_t startinpos, endinpos, outpos;
7488
Victor Stinner7d00cc12014-03-17 23:08:06 +01007489 /* last character in partial decode? */
7490 if (in + insize >= endin && !final)
7491 break;
7492
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 startinpos = in - startin;
7494 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007495 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007496 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 errors, &errorHandler,
7498 encoding, reason,
7499 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007500 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 {
7502 goto error;
7503 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007504 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007505 }
7506 else {
7507 in += insize;
7508 memcpy(out, buffer, outsize * sizeof(wchar_t));
7509 out += outsize;
7510 }
7511 }
7512
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007513 /* Shrink the buffer */
7514 assert(out - *buf <= *bufsize);
7515 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007516 /* (in - startin) <= size and size is an int */
7517 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007518
7519error:
7520 Py_XDECREF(encoding_obj);
7521 Py_XDECREF(errorHandler);
7522 Py_XDECREF(exc);
7523 return ret;
7524}
7525
Victor Stinner3a50e702011-10-18 21:21:00 +02007526static PyObject *
7527decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007528 const char *s, Py_ssize_t size,
7529 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007530{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007531 wchar_t *buf = NULL;
7532 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007533 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007534
Victor Stinner3a50e702011-10-18 21:21:00 +02007535 if (code_page < 0) {
7536 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7537 return NULL;
7538 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007539 if (size < 0) {
7540 PyErr_BadInternalCall();
7541 return NULL;
7542 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007543
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007544 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007545 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007546
Victor Stinner76a31a62011-11-04 00:05:13 +01007547 do
7548 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007549#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007550 if (size > DECODING_CHUNK_SIZE) {
7551 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007552 final = 0;
7553 done = 0;
7554 }
7555 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007556#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007557 {
7558 chunk_size = (int)size;
7559 final = (consumed == NULL);
7560 done = 1;
7561 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007562
Victor Stinner76a31a62011-11-04 00:05:13 +01007563 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007564 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007565 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007566 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007567 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007568
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007569 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007570 s, chunk_size);
7571 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007572 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007573 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007574 errors, final);
7575 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007576
7577 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007578 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007579 return NULL;
7580 }
7581
7582 if (consumed)
7583 *consumed += converted;
7584
7585 s += converted;
7586 size -= converted;
7587 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007588
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007589 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7590 PyMem_Free(buf);
7591 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007592}
7593
Alexander Belopolsky40018472011-02-26 01:02:56 +00007594PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007595PyUnicode_DecodeCodePageStateful(int code_page,
7596 const char *s,
7597 Py_ssize_t size,
7598 const char *errors,
7599 Py_ssize_t *consumed)
7600{
7601 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7602}
7603
7604PyObject *
7605PyUnicode_DecodeMBCSStateful(const char *s,
7606 Py_ssize_t size,
7607 const char *errors,
7608 Py_ssize_t *consumed)
7609{
7610 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7611}
7612
7613PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007614PyUnicode_DecodeMBCS(const char *s,
7615 Py_ssize_t size,
7616 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007617{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007618 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7619}
7620
Victor Stinner3a50e702011-10-18 21:21:00 +02007621static DWORD
7622encode_code_page_flags(UINT code_page, const char *errors)
7623{
7624 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007625 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 }
7627 else if (code_page == CP_UTF7) {
7628 /* CP_UTF7 only supports flags=0 */
7629 return 0;
7630 }
7631 else {
7632 if (errors != NULL && strcmp(errors, "replace") == 0)
7633 return 0;
7634 else
7635 return WC_NO_BEST_FIT_CHARS;
7636 }
7637}
7638
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007639/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 * Encode a Unicode string to a Windows code page into a byte string in strict
7641 * mode.
7642 *
7643 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007644 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007645 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007646static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007647encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007648 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007650{
Victor Stinner554f3f02010-06-16 23:33:54 +00007651 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 BOOL *pusedDefaultChar = &usedDefaultChar;
7653 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007654 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007655 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 const DWORD flags = encode_code_page_flags(code_page, NULL);
7657 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007658 /* Create a substring so that we can get the UTF-16 representation
7659 of just the slice under consideration. */
7660 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007661
Martin v. Löwis3d325192011-11-04 18:23:06 +01007662 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007663
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007665 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007666 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007667 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007668
Victor Stinner2fc507f2011-11-04 20:06:39 +01007669 substring = PyUnicode_Substring(unicode, offset, offset+len);
7670 if (substring == NULL)
7671 return -1;
7672 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7673 if (p == NULL) {
7674 Py_DECREF(substring);
7675 return -1;
7676 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007677 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007678
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007679 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007681 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007682 NULL, 0,
7683 NULL, pusedDefaultChar);
7684 if (outsize <= 0)
7685 goto error;
7686 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007687 if (pusedDefaultChar && *pusedDefaultChar) {
7688 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007689 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007690 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007691
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007695 if (*outbytes == NULL) {
7696 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007698 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007699 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007700 }
7701 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007703 const Py_ssize_t n = PyBytes_Size(*outbytes);
7704 if (outsize > PY_SSIZE_T_MAX - n) {
7705 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007706 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007708 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007709 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7710 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007711 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007712 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007714 }
7715
7716 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007717 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007718 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007719 out, outsize,
7720 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007721 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007722 if (outsize <= 0)
7723 goto error;
7724 if (pusedDefaultChar && *pusedDefaultChar)
7725 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007726 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007727
Victor Stinner3a50e702011-10-18 21:21:00 +02007728error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007729 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007730 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7731 return -2;
7732 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007733 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007734}
7735
Victor Stinner3a50e702011-10-18 21:21:00 +02007736/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007737 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007738 * error handler.
7739 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007740 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007741 * -1 on other error.
7742 */
7743static int
7744encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007745 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007746 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007747{
Victor Stinner3a50e702011-10-18 21:21:00 +02007748 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007749 Py_ssize_t pos = unicode_offset;
7750 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007751 /* Ideally, we should get reason from FormatMessage. This is the Windows
7752 2000 English version of the message. */
7753 const char *reason = "invalid character";
7754 /* 4=maximum length of a UTF-8 sequence */
7755 char buffer[4];
7756 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7757 Py_ssize_t outsize;
7758 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007759 PyObject *errorHandler = NULL;
7760 PyObject *exc = NULL;
7761 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007762 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007763 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007764 PyObject *rep;
7765 int ret = -1;
7766
7767 assert(insize > 0);
7768
7769 encoding = code_page_name(code_page, &encoding_obj);
7770 if (encoding == NULL)
7771 return -1;
7772
7773 if (errors == NULL || strcmp(errors, "strict") == 0) {
7774 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7775 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007776 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007777 if (exc != NULL) {
7778 PyCodec_StrictErrors(exc);
7779 Py_DECREF(exc);
7780 }
7781 Py_XDECREF(encoding_obj);
7782 return -1;
7783 }
7784
7785 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7786 pusedDefaultChar = &usedDefaultChar;
7787 else
7788 pusedDefaultChar = NULL;
7789
7790 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7791 PyErr_NoMemory();
7792 goto error;
7793 }
7794 outsize = insize * Py_ARRAY_LENGTH(buffer);
7795
7796 if (*outbytes == NULL) {
7797 /* Create string object */
7798 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7799 if (*outbytes == NULL)
7800 goto error;
7801 out = PyBytes_AS_STRING(*outbytes);
7802 }
7803 else {
7804 /* Extend string object */
7805 Py_ssize_t n = PyBytes_Size(*outbytes);
7806 if (n > PY_SSIZE_T_MAX - outsize) {
7807 PyErr_NoMemory();
7808 goto error;
7809 }
7810 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7811 goto error;
7812 out = PyBytes_AS_STRING(*outbytes) + n;
7813 }
7814
7815 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007816 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007817 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007818 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7819 wchar_t chars[2];
7820 int charsize;
7821 if (ch < 0x10000) {
7822 chars[0] = (wchar_t)ch;
7823 charsize = 1;
7824 }
7825 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007826 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7827 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007828 charsize = 2;
7829 }
7830
Victor Stinner3a50e702011-10-18 21:21:00 +02007831 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007832 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007833 buffer, Py_ARRAY_LENGTH(buffer),
7834 NULL, pusedDefaultChar);
7835 if (outsize > 0) {
7836 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7837 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007838 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007839 memcpy(out, buffer, outsize);
7840 out += outsize;
7841 continue;
7842 }
7843 }
7844 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7845 PyErr_SetFromWindowsErr(0);
7846 goto error;
7847 }
7848
Victor Stinner3a50e702011-10-18 21:21:00 +02007849 rep = unicode_encode_call_errorhandler(
7850 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007851 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007852 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007853 if (rep == NULL)
7854 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007855 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007856
7857 if (PyBytes_Check(rep)) {
7858 outsize = PyBytes_GET_SIZE(rep);
7859 if (outsize != 1) {
7860 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7861 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7862 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7863 Py_DECREF(rep);
7864 goto error;
7865 }
7866 out = PyBytes_AS_STRING(*outbytes) + offset;
7867 }
7868 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7869 out += outsize;
7870 }
7871 else {
7872 Py_ssize_t i;
7873 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007874 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007875
Benjamin Petersonbac79492012-01-14 13:34:47 -05007876 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007877 Py_DECREF(rep);
7878 goto error;
7879 }
7880
7881 outsize = PyUnicode_GET_LENGTH(rep);
7882 if (outsize != 1) {
7883 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7884 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7885 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7886 Py_DECREF(rep);
7887 goto error;
7888 }
7889 out = PyBytes_AS_STRING(*outbytes) + offset;
7890 }
7891 kind = PyUnicode_KIND(rep);
7892 data = PyUnicode_DATA(rep);
7893 for (i=0; i < outsize; i++) {
7894 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7895 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007896 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007897 encoding, unicode,
7898 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007899 "unable to encode error handler result to ASCII");
7900 Py_DECREF(rep);
7901 goto error;
7902 }
7903 *out = (unsigned char)ch;
7904 out++;
7905 }
7906 }
7907 Py_DECREF(rep);
7908 }
7909 /* write a NUL byte */
7910 *out = 0;
7911 outsize = out - PyBytes_AS_STRING(*outbytes);
7912 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7913 if (_PyBytes_Resize(outbytes, outsize) < 0)
7914 goto error;
7915 ret = 0;
7916
7917error:
7918 Py_XDECREF(encoding_obj);
7919 Py_XDECREF(errorHandler);
7920 Py_XDECREF(exc);
7921 return ret;
7922}
7923
Victor Stinner3a50e702011-10-18 21:21:00 +02007924static PyObject *
7925encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007926 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007927 const char *errors)
7928{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007929 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007930 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007931 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007932 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007933
Victor Stinner29dacf22015-01-26 16:41:32 +01007934 if (!PyUnicode_Check(unicode)) {
7935 PyErr_BadArgument();
7936 return NULL;
7937 }
7938
Benjamin Petersonbac79492012-01-14 13:34:47 -05007939 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007940 return NULL;
7941 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007942
Victor Stinner3a50e702011-10-18 21:21:00 +02007943 if (code_page < 0) {
7944 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7945 return NULL;
7946 }
7947
Martin v. Löwis3d325192011-11-04 18:23:06 +01007948 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007949 return PyBytes_FromStringAndSize(NULL, 0);
7950
Victor Stinner7581cef2011-11-03 22:32:33 +01007951 offset = 0;
7952 do
7953 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007954#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007955 if (len > DECODING_CHUNK_SIZE) {
7956 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007957 done = 0;
7958 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007959 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007960#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007961 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007962 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007963 done = 1;
7964 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007965
Victor Stinner76a31a62011-11-04 00:05:13 +01007966 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007967 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007968 errors);
7969 if (ret == -2)
7970 ret = encode_code_page_errors(code_page, &outbytes,
7971 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007972 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007973 if (ret < 0) {
7974 Py_XDECREF(outbytes);
7975 return NULL;
7976 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007977
Victor Stinner7581cef2011-11-03 22:32:33 +01007978 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007979 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007980 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007981
Victor Stinner3a50e702011-10-18 21:21:00 +02007982 return outbytes;
7983}
7984
7985PyObject *
7986PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7987 Py_ssize_t size,
7988 const char *errors)
7989{
Victor Stinner7581cef2011-11-03 22:32:33 +01007990 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007991 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007992 if (unicode == NULL)
7993 return NULL;
7994 res = encode_code_page(CP_ACP, unicode, errors);
7995 Py_DECREF(unicode);
7996 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007997}
7998
7999PyObject *
8000PyUnicode_EncodeCodePage(int code_page,
8001 PyObject *unicode,
8002 const char *errors)
8003{
Victor Stinner7581cef2011-11-03 22:32:33 +01008004 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008005}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008006
Alexander Belopolsky40018472011-02-26 01:02:56 +00008007PyObject *
8008PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008009{
Victor Stinner7581cef2011-11-03 22:32:33 +01008010 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008011}
8012
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008013#undef NEED_RETRY
8014
Steve Dowercc16be82016-09-08 10:35:16 -07008015#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008016
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017/* --- Character Mapping Codec -------------------------------------------- */
8018
Victor Stinnerfb161b12013-04-18 01:44:27 +02008019static int
8020charmap_decode_string(const char *s,
8021 Py_ssize_t size,
8022 PyObject *mapping,
8023 const char *errors,
8024 _PyUnicodeWriter *writer)
8025{
8026 const char *starts = s;
8027 const char *e;
8028 Py_ssize_t startinpos, endinpos;
8029 PyObject *errorHandler = NULL, *exc = NULL;
8030 Py_ssize_t maplen;
8031 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008032 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008033 Py_UCS4 x;
8034 unsigned char ch;
8035
8036 if (PyUnicode_READY(mapping) == -1)
8037 return -1;
8038
8039 maplen = PyUnicode_GET_LENGTH(mapping);
8040 mapdata = PyUnicode_DATA(mapping);
8041 mapkind = PyUnicode_KIND(mapping);
8042
8043 e = s + size;
8044
8045 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8046 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8047 * is disabled in encoding aliases, latin1 is preferred because
8048 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008049 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008050 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8051 Py_UCS4 maxchar = writer->maxchar;
8052
8053 assert (writer->kind == PyUnicode_1BYTE_KIND);
8054 while (s < e) {
8055 ch = *s;
8056 x = mapdata_ucs1[ch];
8057 if (x > maxchar) {
8058 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8059 goto onError;
8060 maxchar = writer->maxchar;
8061 outdata = (Py_UCS1 *)writer->data;
8062 }
8063 outdata[writer->pos] = x;
8064 writer->pos++;
8065 ++s;
8066 }
8067 return 0;
8068 }
8069
8070 while (s < e) {
8071 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8072 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008073 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008074 if (outkind == PyUnicode_1BYTE_KIND) {
8075 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8076 Py_UCS4 maxchar = writer->maxchar;
8077 while (s < e) {
8078 ch = *s;
8079 x = mapdata_ucs2[ch];
8080 if (x > maxchar)
8081 goto Error;
8082 outdata[writer->pos] = x;
8083 writer->pos++;
8084 ++s;
8085 }
8086 break;
8087 }
8088 else if (outkind == PyUnicode_2BYTE_KIND) {
8089 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8090 while (s < e) {
8091 ch = *s;
8092 x = mapdata_ucs2[ch];
8093 if (x == 0xFFFE)
8094 goto Error;
8095 outdata[writer->pos] = x;
8096 writer->pos++;
8097 ++s;
8098 }
8099 break;
8100 }
8101 }
8102 ch = *s;
8103
8104 if (ch < maplen)
8105 x = PyUnicode_READ(mapkind, mapdata, ch);
8106 else
8107 x = 0xfffe; /* invalid value */
8108Error:
8109 if (x == 0xfffe)
8110 {
8111 /* undefined mapping */
8112 startinpos = s-starts;
8113 endinpos = startinpos+1;
8114 if (unicode_decode_call_errorhandler_writer(
8115 errors, &errorHandler,
8116 "charmap", "character maps to <undefined>",
8117 &starts, &e, &startinpos, &endinpos, &exc, &s,
8118 writer)) {
8119 goto onError;
8120 }
8121 continue;
8122 }
8123
8124 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8125 goto onError;
8126 ++s;
8127 }
8128 Py_XDECREF(errorHandler);
8129 Py_XDECREF(exc);
8130 return 0;
8131
8132onError:
8133 Py_XDECREF(errorHandler);
8134 Py_XDECREF(exc);
8135 return -1;
8136}
8137
8138static int
8139charmap_decode_mapping(const char *s,
8140 Py_ssize_t size,
8141 PyObject *mapping,
8142 const char *errors,
8143 _PyUnicodeWriter *writer)
8144{
8145 const char *starts = s;
8146 const char *e;
8147 Py_ssize_t startinpos, endinpos;
8148 PyObject *errorHandler = NULL, *exc = NULL;
8149 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008150 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008151
8152 e = s + size;
8153
8154 while (s < e) {
8155 ch = *s;
8156
8157 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8158 key = PyLong_FromLong((long)ch);
8159 if (key == NULL)
8160 goto onError;
8161
8162 item = PyObject_GetItem(mapping, key);
8163 Py_DECREF(key);
8164 if (item == NULL) {
8165 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8166 /* No mapping found means: mapping is undefined. */
8167 PyErr_Clear();
8168 goto Undefined;
8169 } else
8170 goto onError;
8171 }
8172
8173 /* Apply mapping */
8174 if (item == Py_None)
8175 goto Undefined;
8176 if (PyLong_Check(item)) {
8177 long value = PyLong_AS_LONG(item);
8178 if (value == 0xFFFE)
8179 goto Undefined;
8180 if (value < 0 || value > MAX_UNICODE) {
8181 PyErr_Format(PyExc_TypeError,
8182 "character mapping must be in range(0x%lx)",
8183 (unsigned long)MAX_UNICODE + 1);
8184 goto onError;
8185 }
8186
8187 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8188 goto onError;
8189 }
8190 else if (PyUnicode_Check(item)) {
8191 if (PyUnicode_READY(item) == -1)
8192 goto onError;
8193 if (PyUnicode_GET_LENGTH(item) == 1) {
8194 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8195 if (value == 0xFFFE)
8196 goto Undefined;
8197 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8198 goto onError;
8199 }
8200 else {
8201 writer->overallocate = 1;
8202 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8203 goto onError;
8204 }
8205 }
8206 else {
8207 /* wrong return value */
8208 PyErr_SetString(PyExc_TypeError,
8209 "character mapping must return integer, None or str");
8210 goto onError;
8211 }
8212 Py_CLEAR(item);
8213 ++s;
8214 continue;
8215
8216Undefined:
8217 /* undefined mapping */
8218 Py_CLEAR(item);
8219 startinpos = s-starts;
8220 endinpos = startinpos+1;
8221 if (unicode_decode_call_errorhandler_writer(
8222 errors, &errorHandler,
8223 "charmap", "character maps to <undefined>",
8224 &starts, &e, &startinpos, &endinpos, &exc, &s,
8225 writer)) {
8226 goto onError;
8227 }
8228 }
8229 Py_XDECREF(errorHandler);
8230 Py_XDECREF(exc);
8231 return 0;
8232
8233onError:
8234 Py_XDECREF(item);
8235 Py_XDECREF(errorHandler);
8236 Py_XDECREF(exc);
8237 return -1;
8238}
8239
Alexander Belopolsky40018472011-02-26 01:02:56 +00008240PyObject *
8241PyUnicode_DecodeCharmap(const char *s,
8242 Py_ssize_t size,
8243 PyObject *mapping,
8244 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008246 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008247
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 /* Default to Latin-1 */
8249 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008253 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008254 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008255 writer.min_length = size;
8256 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008258
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008259 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008260 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8261 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008262 }
8263 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008264 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8265 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008267 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008268
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008270 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 return NULL;
8272}
8273
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008274/* Charmap encoding: the lookup table */
8275
Alexander Belopolsky40018472011-02-26 01:02:56 +00008276struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 PyObject_HEAD
8278 unsigned char level1[32];
8279 int count2, count3;
8280 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008281};
8282
8283static PyObject*
8284encoding_map_size(PyObject *obj, PyObject* args)
8285{
8286 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008287 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008289}
8290
8291static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008292 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 PyDoc_STR("Return the size (in bytes) of this object") },
8294 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008295};
8296
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008297static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008298 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 "EncodingMap", /*tp_name*/
8300 sizeof(struct encoding_map), /*tp_basicsize*/
8301 0, /*tp_itemsize*/
8302 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008303 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008304 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 0, /*tp_getattr*/
8306 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008307 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 0, /*tp_repr*/
8309 0, /*tp_as_number*/
8310 0, /*tp_as_sequence*/
8311 0, /*tp_as_mapping*/
8312 0, /*tp_hash*/
8313 0, /*tp_call*/
8314 0, /*tp_str*/
8315 0, /*tp_getattro*/
8316 0, /*tp_setattro*/
8317 0, /*tp_as_buffer*/
8318 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8319 0, /*tp_doc*/
8320 0, /*tp_traverse*/
8321 0, /*tp_clear*/
8322 0, /*tp_richcompare*/
8323 0, /*tp_weaklistoffset*/
8324 0, /*tp_iter*/
8325 0, /*tp_iternext*/
8326 encoding_map_methods, /*tp_methods*/
8327 0, /*tp_members*/
8328 0, /*tp_getset*/
8329 0, /*tp_base*/
8330 0, /*tp_dict*/
8331 0, /*tp_descr_get*/
8332 0, /*tp_descr_set*/
8333 0, /*tp_dictoffset*/
8334 0, /*tp_init*/
8335 0, /*tp_alloc*/
8336 0, /*tp_new*/
8337 0, /*tp_free*/
8338 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339};
8340
8341PyObject*
8342PyUnicode_BuildEncodingMap(PyObject* string)
8343{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008344 PyObject *result;
8345 struct encoding_map *mresult;
8346 int i;
8347 int need_dict = 0;
8348 unsigned char level1[32];
8349 unsigned char level2[512];
8350 unsigned char *mlevel1, *mlevel2, *mlevel3;
8351 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008353 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008354 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008357 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008358 PyErr_BadArgument();
8359 return NULL;
8360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 kind = PyUnicode_KIND(string);
8362 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008363 length = PyUnicode_GET_LENGTH(string);
8364 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008365 memset(level1, 0xFF, sizeof level1);
8366 memset(level2, 0xFF, sizeof level2);
8367
8368 /* If there isn't a one-to-one mapping of NULL to \0,
8369 or if there are non-BMP characters, we need to use
8370 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008371 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008373 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008374 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 ch = PyUnicode_READ(kind, data, i);
8376 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008377 need_dict = 1;
8378 break;
8379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008381 /* unmapped character */
8382 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 l1 = ch >> 11;
8384 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008385 if (level1[l1] == 0xFF)
8386 level1[l1] = count2++;
8387 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008388 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008389 }
8390
8391 if (count2 >= 0xFF || count3 >= 0xFF)
8392 need_dict = 1;
8393
8394 if (need_dict) {
8395 PyObject *result = PyDict_New();
8396 PyObject *key, *value;
8397 if (!result)
8398 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008399 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008401 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008402 if (!key || !value)
8403 goto failed1;
8404 if (PyDict_SetItem(result, key, value) == -1)
8405 goto failed1;
8406 Py_DECREF(key);
8407 Py_DECREF(value);
8408 }
8409 return result;
8410 failed1:
8411 Py_XDECREF(key);
8412 Py_XDECREF(value);
8413 Py_DECREF(result);
8414 return NULL;
8415 }
8416
8417 /* Create a three-level trie */
8418 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8419 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008420 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008421 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008422 }
8423
8424 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008425 mresult = (struct encoding_map*)result;
8426 mresult->count2 = count2;
8427 mresult->count3 = count3;
8428 mlevel1 = mresult->level1;
8429 mlevel2 = mresult->level23;
8430 mlevel3 = mresult->level23 + 16*count2;
8431 memcpy(mlevel1, level1, 32);
8432 memset(mlevel2, 0xFF, 16*count2);
8433 memset(mlevel3, 0, 128*count3);
8434 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008435 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008436 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008437 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8438 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008439 /* unmapped character */
8440 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008441 o1 = ch>>11;
8442 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008443 i2 = 16*mlevel1[o1] + o2;
8444 if (mlevel2[i2] == 0xFF)
8445 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008446 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008447 i3 = 128*mlevel2[i2] + o3;
8448 mlevel3[i3] = i;
8449 }
8450 return result;
8451}
8452
8453static int
Victor Stinner22168992011-11-20 17:09:18 +01008454encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455{
8456 struct encoding_map *map = (struct encoding_map*)mapping;
8457 int l1 = c>>11;
8458 int l2 = (c>>7) & 0xF;
8459 int l3 = c & 0x7F;
8460 int i;
8461
Victor Stinner22168992011-11-20 17:09:18 +01008462 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008464 if (c == 0)
8465 return 0;
8466 /* level 1*/
8467 i = map->level1[l1];
8468 if (i == 0xFF) {
8469 return -1;
8470 }
8471 /* level 2*/
8472 i = map->level23[16*i+l2];
8473 if (i == 0xFF) {
8474 return -1;
8475 }
8476 /* level 3 */
8477 i = map->level23[16*map->count2 + 128*i + l3];
8478 if (i == 0) {
8479 return -1;
8480 }
8481 return i;
8482}
8483
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008484/* Lookup the character ch in the mapping. If the character
8485 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008486 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008487static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008488charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489{
Christian Heimes217cfd12007-12-02 14:31:20 +00008490 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008491 PyObject *x;
8492
8493 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495 x = PyObject_GetItem(mapping, w);
8496 Py_DECREF(w);
8497 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8499 /* No mapping found means: mapping is undefined. */
8500 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008501 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 } else
8503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008505 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008507 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 long value = PyLong_AS_LONG(x);
8509 if (value < 0 || value > 255) {
8510 PyErr_SetString(PyExc_TypeError,
8511 "character mapping must be in range(256)");
8512 Py_DECREF(x);
8513 return NULL;
8514 }
8515 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008517 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 /* wrong return value */
8521 PyErr_Format(PyExc_TypeError,
8522 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008523 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 Py_DECREF(x);
8525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526 }
8527}
8528
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008529static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008530charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008531{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008532 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8533 /* exponentially overallocate to minimize reallocations */
8534 if (requiredsize < 2*outsize)
8535 requiredsize = 2*outsize;
8536 if (_PyBytes_Resize(outobj, requiredsize))
8537 return -1;
8538 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008539}
8540
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008545 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 space is available. Return a new reference to the object that
8547 was put in the output buffer, or Py_None, if the mapping was undefined
8548 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008549 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008550static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008551charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008552 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008554 PyObject *rep;
8555 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008556 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557
Andy Lesterdffe4c02020-03-04 07:15:20 -06008558 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008559 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008561 if (res == -1)
8562 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 if (outsize<requiredsize)
8564 if (charmapencode_resize(outobj, outpos, requiredsize))
8565 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008566 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 outstart[(*outpos)++] = (char)res;
8568 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008569 }
8570
8571 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008574 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 Py_DECREF(rep);
8576 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008577 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 if (PyLong_Check(rep)) {
8579 Py_ssize_t requiredsize = *outpos+1;
8580 if (outsize<requiredsize)
8581 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8582 Py_DECREF(rep);
8583 return enc_EXCEPTION;
8584 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008585 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008587 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 else {
8589 const char *repchars = PyBytes_AS_STRING(rep);
8590 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8591 Py_ssize_t requiredsize = *outpos+repsize;
8592 if (outsize<requiredsize)
8593 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8594 Py_DECREF(rep);
8595 return enc_EXCEPTION;
8596 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008597 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 memcpy(outstart + *outpos, repchars, repsize);
8599 *outpos += repsize;
8600 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008602 Py_DECREF(rep);
8603 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008604}
8605
8606/* handle an error in PyUnicode_EncodeCharmap
8607 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008608static int
8609charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008610 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008611 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008612 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008613 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008614{
8615 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008616 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008617 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008618 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008619 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008620 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008621 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008622 Py_ssize_t collstartpos = *inpos;
8623 Py_ssize_t collendpos = *inpos+1;
8624 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008625 const char *encoding = "charmap";
8626 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008627 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008628 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008629 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630
Benjamin Petersonbac79492012-01-14 13:34:47 -05008631 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008632 return -1;
8633 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634 /* find all unencodable characters */
8635 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008636 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008637 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008638 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008639 val = encoding_map_lookup(ch, mapping);
8640 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 break;
8642 ++collendpos;
8643 continue;
8644 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008645
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008646 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8647 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 if (rep==NULL)
8649 return -1;
8650 else if (rep!=Py_None) {
8651 Py_DECREF(rep);
8652 break;
8653 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008654 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008656 }
8657 /* cache callback name lookup
8658 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008659 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008660 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008661
8662 switch (*error_handler) {
8663 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008664 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008665 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008666
8667 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008668 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 x = charmapencode_output('?', mapping, res, respos);
8670 if (x==enc_EXCEPTION) {
8671 return -1;
8672 }
8673 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008674 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return -1;
8676 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008677 }
8678 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008679 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008680 *inpos = collendpos;
8681 break;
Victor Stinner50149202015-09-22 00:26:54 +02008682
8683 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008684 /* generate replacement (temporarily (mis)uses p) */
8685 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 char buffer[2+29+1+1];
8687 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008688 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 for (cp = buffer; *cp; ++cp) {
8690 x = charmapencode_output(*cp, mapping, res, respos);
8691 if (x==enc_EXCEPTION)
8692 return -1;
8693 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008694 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 return -1;
8696 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008697 }
8698 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008699 *inpos = collendpos;
8700 break;
Victor Stinner50149202015-09-22 00:26:54 +02008701
Benjamin Peterson14339b62009-01-31 16:36:08 +00008702 default:
Victor Stinner50149202015-09-22 00:26:54 +02008703 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008704 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008706 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008708 if (PyBytes_Check(repunicode)) {
8709 /* Directly copy bytes result to output. */
8710 Py_ssize_t outsize = PyBytes_Size(*res);
8711 Py_ssize_t requiredsize;
8712 repsize = PyBytes_Size(repunicode);
8713 requiredsize = *respos + repsize;
8714 if (requiredsize > outsize)
8715 /* Make room for all additional bytes. */
8716 if (charmapencode_resize(res, respos, requiredsize)) {
8717 Py_DECREF(repunicode);
8718 return -1;
8719 }
8720 memcpy(PyBytes_AsString(*res) + *respos,
8721 PyBytes_AsString(repunicode), repsize);
8722 *respos += repsize;
8723 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008724 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008725 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008726 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008727 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008728 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008729 Py_DECREF(repunicode);
8730 return -1;
8731 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008732 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008733 data = PyUnicode_DATA(repunicode);
8734 kind = PyUnicode_KIND(repunicode);
8735 for (index = 0; index < repsize; index++) {
8736 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8737 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008739 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 return -1;
8741 }
8742 else if (x==enc_FAILED) {
8743 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008744 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 return -1;
8746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008747 }
8748 *inpos = newpos;
8749 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750 }
8751 return 0;
8752}
8753
Alexander Belopolsky40018472011-02-26 01:02:56 +00008754PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008755_PyUnicode_EncodeCharmap(PyObject *unicode,
8756 PyObject *mapping,
8757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759 /* output object */
8760 PyObject *res = NULL;
8761 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008762 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008763 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008765 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008766 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008767 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008768 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008769 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008770 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771
Benjamin Petersonbac79492012-01-14 13:34:47 -05008772 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008773 return NULL;
8774 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008775 data = PyUnicode_DATA(unicode);
8776 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008777
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 /* Default to Latin-1 */
8779 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008780 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008782 /* allocate enough for a simple encoding without
8783 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008784 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785 if (res == NULL)
8786 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008787 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008790 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008791 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008793 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 if (x==enc_EXCEPTION) /* error */
8795 goto onError;
8796 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008797 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008799 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 &res, &respos)) {
8801 goto onError;
8802 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008803 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 else
8805 /* done with this character => adjust input position */
8806 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008809 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008810 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008811 if (_PyBytes_Resize(&res, respos) < 0)
8812 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008814 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008815 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816 return res;
8817
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008819 Py_XDECREF(res);
8820 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008821 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 return NULL;
8823}
8824
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008825/* Deprecated */
8826PyObject *
8827PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8828 Py_ssize_t size,
8829 PyObject *mapping,
8830 const char *errors)
8831{
8832 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008833 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008834 if (unicode == NULL)
8835 return NULL;
8836 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8837 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008838 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008839}
8840
Alexander Belopolsky40018472011-02-26 01:02:56 +00008841PyObject *
8842PyUnicode_AsCharmapString(PyObject *unicode,
8843 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844{
8845 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008846 PyErr_BadArgument();
8847 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008849 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850}
8851
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008852/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008853static void
8854make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008856 Py_ssize_t startpos, Py_ssize_t endpos,
8857 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008859 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 *exceptionObject = _PyUnicodeTranslateError_Create(
8861 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 }
8863 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8865 goto onError;
8866 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8867 goto onError;
8868 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8869 goto onError;
8870 return;
8871 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008872 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 }
8874}
8875
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008876/* error handling callback helper:
8877 build arguments, call the callback and check the arguments,
8878 put the result into newpos and return the replacement string, which
8879 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008880static PyObject *
8881unicode_translate_call_errorhandler(const char *errors,
8882 PyObject **errorHandler,
8883 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008885 Py_ssize_t startpos, Py_ssize_t endpos,
8886 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008887{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008888 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008889
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008890 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008891 PyObject *restuple;
8892 PyObject *resunicode;
8893
8894 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008896 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008898 }
8899
8900 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008902 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008904
Petr Viktorinffd97532020-02-11 17:46:57 +01008905 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008906 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008908 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008909 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 Py_DECREF(restuple);
8911 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008912 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008913 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 &resunicode, &i_newpos)) {
8915 Py_DECREF(restuple);
8916 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008917 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008918 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008920 else
8921 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008923 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 Py_DECREF(restuple);
8925 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008926 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008927 Py_INCREF(resunicode);
8928 Py_DECREF(restuple);
8929 return resunicode;
8930}
8931
8932/* Lookup the character ch in the mapping and put the result in result,
8933 which must be decrefed by the caller.
8934 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008935static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008937{
Christian Heimes217cfd12007-12-02 14:31:20 +00008938 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008939 PyObject *x;
8940
8941 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008943 x = PyObject_GetItem(mapping, w);
8944 Py_DECREF(w);
8945 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8947 /* No mapping found means: use 1:1 mapping. */
8948 PyErr_Clear();
8949 *result = NULL;
8950 return 0;
8951 } else
8952 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008953 }
8954 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 *result = x;
8956 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008957 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008958 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008960 if (value < 0 || value > MAX_UNICODE) {
8961 PyErr_Format(PyExc_ValueError,
8962 "character mapping must be in range(0x%x)",
8963 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 Py_DECREF(x);
8965 return -1;
8966 }
8967 *result = x;
8968 return 0;
8969 }
8970 else if (PyUnicode_Check(x)) {
8971 *result = x;
8972 return 0;
8973 }
8974 else {
8975 /* wrong return value */
8976 PyErr_SetString(PyExc_TypeError,
8977 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008978 Py_DECREF(x);
8979 return -1;
8980 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008981}
Victor Stinner1194ea02014-04-04 19:37:40 +02008982
8983/* lookup the character, write the result into the writer.
8984 Return 1 if the result was written into the writer, return 0 if the mapping
8985 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008986static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008987charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8988 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008989{
Victor Stinner1194ea02014-04-04 19:37:40 +02008990 PyObject *item;
8991
8992 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008994
8995 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008997 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009000 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009001 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009002
9003 if (item == Py_None) {
9004 Py_DECREF(item);
9005 return 0;
9006 }
9007
9008 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009009 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9010 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9011 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9013 Py_DECREF(item);
9014 return -1;
9015 }
9016 Py_DECREF(item);
9017 return 1;
9018 }
9019
9020 if (!PyUnicode_Check(item)) {
9021 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009023 }
9024
9025 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9026 Py_DECREF(item);
9027 return -1;
9028 }
9029
9030 Py_DECREF(item);
9031 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009032}
9033
Victor Stinner89a76ab2014-04-05 11:44:04 +02009034static int
9035unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9036 Py_UCS1 *translate)
9037{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009038 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009039 int ret = 0;
9040
Victor Stinner89a76ab2014-04-05 11:44:04 +02009041 if (charmaptranslate_lookup(ch, mapping, &item)) {
9042 return -1;
9043 }
9044
9045 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009046 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009047 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009048 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009049 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009050 /* not found => default to 1:1 mapping */
9051 translate[ch] = ch;
9052 return 1;
9053 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009054 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009055 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009056 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9057 used it */
9058 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009059 /* invalid character or character outside ASCII:
9060 skip the fast translate */
9061 goto exit;
9062 }
9063 translate[ch] = (Py_UCS1)replace;
9064 }
9065 else if (PyUnicode_Check(item)) {
9066 Py_UCS4 replace;
9067
9068 if (PyUnicode_READY(item) == -1) {
9069 Py_DECREF(item);
9070 return -1;
9071 }
9072 if (PyUnicode_GET_LENGTH(item) != 1)
9073 goto exit;
9074
9075 replace = PyUnicode_READ_CHAR(item, 0);
9076 if (replace > 127)
9077 goto exit;
9078 translate[ch] = (Py_UCS1)replace;
9079 }
9080 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009081 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009082 goto exit;
9083 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009084 ret = 1;
9085
Benjamin Peterson1365de72014-04-07 20:15:41 -04009086 exit:
9087 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009088 return ret;
9089}
9090
9091/* Fast path for ascii => ascii translation. Return 1 if the whole string
9092 was translated into writer, return 0 if the input string was partially
9093 translated into writer, raise an exception and return -1 on error. */
9094static int
9095unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009096 _PyUnicodeWriter *writer, int ignore,
9097 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009098{
Victor Stinner872b2912014-04-05 14:27:07 +02009099 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009100 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009101 const Py_UCS1 *in, *end;
9102 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009103 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009104
Victor Stinner89a76ab2014-04-05 11:44:04 +02009105 len = PyUnicode_GET_LENGTH(input);
9106
Victor Stinner872b2912014-04-05 14:27:07 +02009107 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009108
9109 in = PyUnicode_1BYTE_DATA(input);
9110 end = in + len;
9111
9112 assert(PyUnicode_IS_ASCII(writer->buffer));
9113 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9114 out = PyUnicode_1BYTE_DATA(writer->buffer);
9115
Victor Stinner872b2912014-04-05 14:27:07 +02009116 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009117 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009118 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009119 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009120 int translate = unicode_fast_translate_lookup(mapping, ch,
9121 ascii_table);
9122 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009123 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009124 if (translate == 0)
9125 goto exit;
9126 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009127 }
Victor Stinner872b2912014-04-05 14:27:07 +02009128 if (ch2 == 0xfe) {
9129 if (ignore)
9130 continue;
9131 goto exit;
9132 }
9133 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009134 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009135 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009136 }
Victor Stinner872b2912014-04-05 14:27:07 +02009137 res = 1;
9138
9139exit:
9140 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009141 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009142 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009143}
9144
Victor Stinner3222da22015-10-01 22:07:32 +02009145static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146_PyUnicode_TranslateCharmap(PyObject *input,
9147 PyObject *mapping,
9148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009151 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 Py_ssize_t size, i;
9153 int kind;
9154 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009155 _PyUnicodeWriter writer;
9156 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009157 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009158 PyObject *errorHandler = NULL;
9159 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009160 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009161 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009162
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 PyErr_BadArgument();
9165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 if (PyUnicode_READY(input) == -1)
9169 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009170 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 kind = PyUnicode_KIND(input);
9172 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009174 if (size == 0)
9175 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009177 /* allocate enough for a simple 1:1 translation without
9178 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009179 _PyUnicodeWriter_Init(&writer);
9180 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182
Victor Stinner872b2912014-04-05 14:27:07 +02009183 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9184
Victor Stinner33798672016-03-01 21:59:58 +01009185 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009186 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009187 if (PyUnicode_IS_ASCII(input)) {
9188 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9189 if (res < 0) {
9190 _PyUnicodeWriter_Dealloc(&writer);
9191 return NULL;
9192 }
9193 if (res == 1)
9194 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009195 }
Victor Stinner33798672016-03-01 21:59:58 +01009196 else {
9197 i = 0;
9198 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009202 int translate;
9203 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9204 Py_ssize_t newpos;
9205 /* startpos for collecting untranslatable chars */
9206 Py_ssize_t collstart;
9207 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009208 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009209
Victor Stinner1194ea02014-04-04 19:37:40 +02009210 ch = PyUnicode_READ(kind, data, i);
9211 translate = charmaptranslate_output(ch, mapping, &writer);
9212 if (translate < 0)
9213 goto onError;
9214
9215 if (translate != 0) {
9216 /* it worked => adjust input pointer */
9217 ++i;
9218 continue;
9219 }
9220
9221 /* untranslatable character */
9222 collstart = i;
9223 collend = i+1;
9224
9225 /* find all untranslatable characters */
9226 while (collend < size) {
9227 PyObject *x;
9228 ch = PyUnicode_READ(kind, data, collend);
9229 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009230 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009231 Py_XDECREF(x);
9232 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009233 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009234 ++collend;
9235 }
9236
9237 if (ignore) {
9238 i = collend;
9239 }
9240 else {
9241 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9242 reason, input, &exc,
9243 collstart, collend, &newpos);
9244 if (repunicode == NULL)
9245 goto onError;
9246 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009247 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009248 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009249 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009250 Py_DECREF(repunicode);
9251 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009252 }
9253 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009254 Py_XDECREF(exc);
9255 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009256 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009257
Benjamin Peterson29060642009-01-31 22:14:21 +00009258 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009259 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009260 Py_XDECREF(exc);
9261 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262 return NULL;
9263}
9264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265/* Deprecated. Use PyUnicode_Translate instead. */
9266PyObject *
9267PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9268 Py_ssize_t size,
9269 PyObject *mapping,
9270 const char *errors)
9271{
Christian Heimes5f520f42012-09-11 14:03:25 +02009272 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009273 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 if (!unicode)
9275 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009276 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9277 Py_DECREF(unicode);
9278 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279}
9280
Alexander Belopolsky40018472011-02-26 01:02:56 +00009281PyObject *
9282PyUnicode_Translate(PyObject *str,
9283 PyObject *mapping,
9284 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009286 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009287 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009288 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289}
Tim Petersced69f82003-09-16 20:30:58 +00009290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291PyObject *
9292_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9293{
9294 if (!PyUnicode_Check(unicode)) {
9295 PyErr_BadInternalCall();
9296 return NULL;
9297 }
9298 if (PyUnicode_READY(unicode) == -1)
9299 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009300 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 /* If the string is already ASCII, just return the same string */
9302 Py_INCREF(unicode);
9303 return unicode;
9304 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009305
9306 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9307 PyObject *result = PyUnicode_New(len, 127);
9308 if (result == NULL) {
9309 return NULL;
9310 }
9311
9312 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9313 int kind = PyUnicode_KIND(unicode);
9314 const void *data = PyUnicode_DATA(unicode);
9315 Py_ssize_t i;
9316 for (i = 0; i < len; ++i) {
9317 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9318 if (ch < 127) {
9319 out[i] = ch;
9320 }
9321 else if (Py_UNICODE_ISSPACE(ch)) {
9322 out[i] = ' ';
9323 }
9324 else {
9325 int decimal = Py_UNICODE_TODECIMAL(ch);
9326 if (decimal < 0) {
9327 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009328 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009329 _PyUnicode_LENGTH(result) = i + 1;
9330 break;
9331 }
9332 out[i] = '0' + decimal;
9333 }
9334 }
9335
INADA Naoki16dfca42018-07-14 12:06:43 +09009336 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009337 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338}
9339
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009340PyObject *
9341PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9342 Py_ssize_t length)
9343{
Victor Stinnerf0124502011-11-21 23:12:56 +01009344 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009345 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009346 Py_UCS4 maxchar;
9347 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009348 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009349
Victor Stinner99d7ad02012-02-22 13:37:39 +01009350 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009351 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009352 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009353 if (ch > 127) {
9354 int decimal = Py_UNICODE_TODECIMAL(ch);
9355 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009356 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009357 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009358 }
9359 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009360
9361 /* Copy to a new string */
9362 decimal = PyUnicode_New(length, maxchar);
9363 if (decimal == NULL)
9364 return decimal;
9365 kind = PyUnicode_KIND(decimal);
9366 data = PyUnicode_DATA(decimal);
9367 /* Iterate over code points */
9368 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009369 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009370 if (ch > 127) {
9371 int decimal = Py_UNICODE_TODECIMAL(ch);
9372 if (decimal >= 0)
9373 ch = '0' + decimal;
9374 }
9375 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009377 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009378}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009379/* --- Decimal Encoder ---------------------------------------------------- */
9380
Alexander Belopolsky40018472011-02-26 01:02:56 +00009381int
9382PyUnicode_EncodeDecimal(Py_UNICODE *s,
9383 Py_ssize_t length,
9384 char *output,
9385 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009386{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009387 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009388 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009389 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009390 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009391
9392 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 PyErr_BadArgument();
9394 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009395 }
9396
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009397 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009398 if (unicode == NULL)
9399 return -1;
9400
Victor Stinner42bf7752011-11-21 22:52:58 +01009401 kind = PyUnicode_KIND(unicode);
9402 data = PyUnicode_DATA(unicode);
9403
Victor Stinnerb84d7232011-11-22 01:50:07 +01009404 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009405 PyObject *exc;
9406 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009407 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009408 Py_ssize_t startpos;
9409
9410 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009411
Benjamin Peterson29060642009-01-31 22:14:21 +00009412 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009413 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009414 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009415 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009416 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 decimal = Py_UNICODE_TODECIMAL(ch);
9418 if (decimal >= 0) {
9419 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009420 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009421 continue;
9422 }
9423 if (0 < ch && ch < 256) {
9424 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009425 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009426 continue;
9427 }
Victor Stinner6345be92011-11-25 20:09:01 +01009428
Victor Stinner42bf7752011-11-21 22:52:58 +01009429 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009430 exc = NULL;
9431 raise_encode_exception(&exc, "decimal", unicode,
9432 startpos, startpos+1,
9433 "invalid decimal Unicode string");
9434 Py_XDECREF(exc);
9435 Py_DECREF(unicode);
9436 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009437 }
9438 /* 0-terminate the output string */
9439 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009440 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009441 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009442}
9443
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444/* --- Helpers ------------------------------------------------------------ */
9445
/* Helper macro to fix up start/end slice values against a length `len`:
   `end` is clamped to `len`; negative `start`/`end` are taken relative to
   `len` and then clamped to 0.

   `start` and `end` must be modifiable lvalues; `len` is evaluated more
   than once.  The expansion is a single statement (do { } while (0)) so the
   macro is safe inside unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if (end > (len))                        \
            end = (len);                        \
        else if (end < 0) {                     \
            end += (len);                       \
            if (end < 0)                        \
                end = 0;                        \
        }                                       \
        if (start < 0) {                        \
            start += (len);                     \
            if (start < 0)                      \
                start = 0;                      \
        }                                       \
    } while (0)
9460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009462any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009464 Py_ssize_t end,
9465 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009467 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009468 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 Py_ssize_t len1, len2, result;
9470
9471 kind1 = PyUnicode_KIND(s1);
9472 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009473 if (kind1 < kind2)
9474 return -1;
9475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 len1 = PyUnicode_GET_LENGTH(s1);
9477 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009478 ADJUST_INDICES(start, end, len1);
9479 if (end - start < len2)
9480 return -1;
9481
9482 buf1 = PyUnicode_DATA(s1);
9483 buf2 = PyUnicode_DATA(s2);
9484 if (len2 == 1) {
9485 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9486 result = findchar((const char *)buf1 + kind1*start,
9487 kind1, end - start, ch, direction);
9488 if (result == -1)
9489 return -1;
9490 else
9491 return start + result;
9492 }
9493
9494 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009495 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009496 if (!buf2)
9497 return -2;
9498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499
Victor Stinner794d5672011-10-10 03:21:36 +02009500 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009501 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009502 case PyUnicode_1BYTE_KIND:
9503 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9504 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9505 else
9506 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9507 break;
9508 case PyUnicode_2BYTE_KIND:
9509 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9510 break;
9511 case PyUnicode_4BYTE_KIND:
9512 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9513 break;
9514 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009515 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009516 }
9517 }
9518 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009519 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009520 case PyUnicode_1BYTE_KIND:
9521 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9522 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9523 else
9524 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9525 break;
9526 case PyUnicode_2BYTE_KIND:
9527 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9528 break;
9529 case PyUnicode_4BYTE_KIND:
9530 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9531 break;
9532 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009533 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 }
9536
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009537 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009538 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009539 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540
9541 return result;
9542}
9543
Victor Stinner59423e32018-11-26 13:40:01 +01009544/* _PyUnicode_InsertThousandsGrouping() helper functions */
9545#include "stringlib/localeutil.h"
9546
9547/**
9548 * InsertThousandsGrouping:
9549 * @writer: Unicode writer.
9550 * @n_buffer: Number of characters in @buffer.
9551 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9552 * @d_pos: Start of digits string.
9553 * @n_digits: The number of digits in the string, in which we want
9554 * to put the grouping chars.
9555 * @min_width: The minimum width of the digits in the output string.
9556 * Output will be zero-padded on the left to fill.
9557 * @grouping: see definition in localeconv().
9558 * @thousands_sep: see definition in localeconv().
9559 *
9560 * There are 2 modes: counting and filling. If @writer is NULL,
9561 * we are in counting mode, else filling mode.
9562 * If counting, the required buffer size is returned.
9563 * If filling, we know the buffer will be large enough, so we don't
9564 * need to pass in the buffer size.
9565 * Inserts thousand grouping characters (as defined by grouping and
9566 * thousands_sep) into @writer.
9567 *
9568 * Return value: -1 on error, number of characters otherwise.
9569 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009571_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009572 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009573 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009574 PyObject *digits,
9575 Py_ssize_t d_pos,
9576 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009577 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009578 const char *grouping,
9579 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009580 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581{
Xtreak3f7983a2019-01-07 20:39:14 +05309582 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009583 if (writer) {
9584 assert(digits != NULL);
9585 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009586 }
9587 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009588 assert(digits == NULL);
9589 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009590 }
Victor Stinner59423e32018-11-26 13:40:01 +01009591 assert(0 <= d_pos);
9592 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009593 assert(grouping != NULL);
9594
9595 if (digits != NULL) {
9596 if (PyUnicode_READY(digits) == -1) {
9597 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009598 }
Victor Stinner59423e32018-11-26 13:40:01 +01009599 }
9600 if (PyUnicode_READY(thousands_sep) == -1) {
9601 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009602 }
9603
Victor Stinner59423e32018-11-26 13:40:01 +01009604 Py_ssize_t count = 0;
9605 Py_ssize_t n_zeros;
9606 int loop_broken = 0;
9607 int use_separator = 0; /* First time through, don't append the
9608 separator. They only go between
9609 groups. */
9610 Py_ssize_t buffer_pos;
9611 Py_ssize_t digits_pos;
9612 Py_ssize_t len;
9613 Py_ssize_t n_chars;
9614 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9615 be looked at */
9616 /* A generator that returns all of the grouping widths, until it
9617 returns 0. */
9618 GroupGenerator groupgen;
9619 GroupGenerator_init(&groupgen, grouping);
9620 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9621
9622 /* if digits are not grouped, thousands separator
9623 should be an empty string */
9624 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9625
9626 digits_pos = d_pos + n_digits;
9627 if (writer) {
9628 buffer_pos = writer->pos + n_buffer;
9629 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9630 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 }
Victor Stinner59423e32018-11-26 13:40:01 +01009632 else {
9633 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009634 }
Victor Stinner59423e32018-11-26 13:40:01 +01009635
9636 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009637 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009638 }
Victor Stinner59423e32018-11-26 13:40:01 +01009639
9640 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9641 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9642 n_zeros = Py_MAX(0, len - remaining);
9643 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9644
9645 /* Use n_zero zero's and n_chars chars */
9646
9647 /* Count only, don't do anything. */
9648 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9649
9650 /* Copy into the writer. */
9651 InsertThousandsGrouping_fill(writer, &buffer_pos,
9652 digits, &digits_pos,
9653 n_chars, n_zeros,
9654 use_separator ? thousands_sep : NULL,
9655 thousands_sep_len, maxchar);
9656
9657 /* Use a separator next time. */
9658 use_separator = 1;
9659
9660 remaining -= n_chars;
9661 min_width -= len;
9662
9663 if (remaining <= 0 && min_width <= 0) {
9664 loop_broken = 1;
9665 break;
9666 }
9667 min_width -= thousands_sep_len;
9668 }
9669 if (!loop_broken) {
9670 /* We left the loop without using a break statement. */
9671
9672 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9673 n_zeros = Py_MAX(0, len - remaining);
9674 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9675
9676 /* Use n_zero zero's and n_chars chars */
9677 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9678
9679 /* Copy into the writer. */
9680 InsertThousandsGrouping_fill(writer, &buffer_pos,
9681 digits, &digits_pos,
9682 n_chars, n_zeros,
9683 use_separator ? thousands_sep : NULL,
9684 thousands_sep_len, maxchar);
9685 }
9686 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687}
9688
9689
Alexander Belopolsky40018472011-02-26 01:02:56 +00009690Py_ssize_t
9691PyUnicode_Count(PyObject *str,
9692 PyObject *substr,
9693 Py_ssize_t start,
9694 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009696 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009697 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009698 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009700
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009701 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009702 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009703
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009704 kind1 = PyUnicode_KIND(str);
9705 kind2 = PyUnicode_KIND(substr);
9706 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009707 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009708
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009709 len1 = PyUnicode_GET_LENGTH(str);
9710 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009712 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009713 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009714
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009715 buf1 = PyUnicode_DATA(str);
9716 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009717 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009718 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009719 if (!buf2)
9720 goto onError;
9721 }
9722
9723 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009725 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009726 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009727 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009728 buf2, len2, PY_SSIZE_T_MAX
9729 );
9730 else
9731 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009732 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009733 buf2, len2, PY_SSIZE_T_MAX
9734 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 break;
9736 case PyUnicode_2BYTE_KIND:
9737 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009738 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 buf2, len2, PY_SSIZE_T_MAX
9740 );
9741 break;
9742 case PyUnicode_4BYTE_KIND:
9743 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009744 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 buf2, len2, PY_SSIZE_T_MAX
9746 );
9747 break;
9748 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009749 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009751
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009752 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009753 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009754 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009758 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9759 if (kind2 != kind1)
9760 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762}
9763
Alexander Belopolsky40018472011-02-26 01:02:56 +00009764Py_ssize_t
9765PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009766 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009767 Py_ssize_t start,
9768 Py_ssize_t end,
9769 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009771 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009772 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009773
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009774 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009775}
9776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777Py_ssize_t
9778PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9779 Py_ssize_t start, Py_ssize_t end,
9780 int direction)
9781{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009783 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 if (PyUnicode_READY(str) == -1)
9785 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009786 len = PyUnicode_GET_LENGTH(str);
9787 ADJUST_INDICES(start, end, len);
9788 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009789 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009791 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9792 kind, end-start, ch, direction);
9793 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009795 else
9796 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797}
9798
Alexander Belopolsky40018472011-02-26 01:02:56 +00009799static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009800tailmatch(PyObject *self,
9801 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009802 Py_ssize_t start,
9803 Py_ssize_t end,
9804 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 int kind_self;
9807 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009808 const void *data_self;
9809 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 Py_ssize_t offset;
9811 Py_ssize_t i;
9812 Py_ssize_t end_sub;
9813
9814 if (PyUnicode_READY(self) == -1 ||
9815 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009816 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9819 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009821 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009823 if (PyUnicode_GET_LENGTH(substring) == 0)
9824 return 1;
9825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 kind_self = PyUnicode_KIND(self);
9827 data_self = PyUnicode_DATA(self);
9828 kind_sub = PyUnicode_KIND(substring);
9829 data_sub = PyUnicode_DATA(substring);
9830 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9831
9832 if (direction > 0)
9833 offset = end;
9834 else
9835 offset = start;
9836
9837 if (PyUnicode_READ(kind_self, data_self, offset) ==
9838 PyUnicode_READ(kind_sub, data_sub, 0) &&
9839 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9840 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9841 /* If both are of the same kind, memcmp is sufficient */
9842 if (kind_self == kind_sub) {
9843 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009844 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 data_sub,
9846 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009847 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009849 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 else {
9851 /* We do not need to compare 0 and len(substring)-1 because
9852 the if statement above ensured already that they are equal
9853 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 for (i = 1; i < end_sub; ++i) {
9855 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9856 PyUnicode_READ(kind_sub, data_sub, i))
9857 return 0;
9858 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009859 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861 }
9862
9863 return 0;
9864}
9865
Alexander Belopolsky40018472011-02-26 01:02:56 +00009866Py_ssize_t
9867PyUnicode_Tailmatch(PyObject *str,
9868 PyObject *substr,
9869 Py_ssize_t start,
9870 Py_ssize_t end,
9871 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009873 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009874 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009875
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009876 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877}
9878
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009879static PyObject *
9880ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009882 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009883 const char *data = PyUnicode_DATA(self);
9884 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009885 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009886
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009887 res = PyUnicode_New(len, 127);
9888 if (res == NULL)
9889 return NULL;
9890 resdata = PyUnicode_DATA(res);
9891 if (lower)
9892 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009894 _Py_bytes_upper(resdata, data, len);
9895 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896}
9897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009899handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009901 Py_ssize_t j;
9902 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009903 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009904 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009905
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009906 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9907
9908 where ! is a negation and \p{xxx} is a character with property xxx.
9909 */
9910 for (j = i - 1; j >= 0; j--) {
9911 c = PyUnicode_READ(kind, data, j);
9912 if (!_PyUnicode_IsCaseIgnorable(c))
9913 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009915 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9916 if (final_sigma) {
9917 for (j = i + 1; j < length; j++) {
9918 c = PyUnicode_READ(kind, data, j);
9919 if (!_PyUnicode_IsCaseIgnorable(c))
9920 break;
9921 }
9922 final_sigma = j == length || !_PyUnicode_IsCased(c);
9923 }
9924 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925}
9926
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009927static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009928lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009929 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009931 /* Obscure special case. */
9932 if (c == 0x3A3) {
9933 mapped[0] = handle_capital_sigma(kind, data, length, i);
9934 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009936 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937}
9938
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009939static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009940do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009942 Py_ssize_t i, k = 0;
9943 int n_res, j;
9944 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009945
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009946 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009947 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009948 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009949 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009950 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009952 for (i = 1; i < length; i++) {
9953 c = PyUnicode_READ(kind, data, i);
9954 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9955 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009956 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009957 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009958 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009959 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009960 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961}
9962
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009963static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009964do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009965 Py_ssize_t i, k = 0;
9966
9967 for (i = 0; i < length; i++) {
9968 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9969 int n_res, j;
9970 if (Py_UNICODE_ISUPPER(c)) {
9971 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9972 }
9973 else if (Py_UNICODE_ISLOWER(c)) {
9974 n_res = _PyUnicode_ToUpperFull(c, mapped);
9975 }
9976 else {
9977 n_res = 1;
9978 mapped[0] = c;
9979 }
9980 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009981 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009982 res[k++] = mapped[j];
9983 }
9984 }
9985 return k;
9986}
9987
9988static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009989do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009990 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009992 Py_ssize_t i, k = 0;
9993
9994 for (i = 0; i < length; i++) {
9995 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9996 int n_res, j;
9997 if (lower)
9998 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9999 else
10000 n_res = _PyUnicode_ToUpperFull(c, mapped);
10001 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010002 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010003 res[k++] = mapped[j];
10004 }
10005 }
10006 return k;
10007}
10008
10009static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010010do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010011{
10012 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10013}
10014
10015static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010016do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010017{
10018 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10019}
10020
Benjamin Petersone51757f2012-01-12 21:10:29 -050010021static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010022do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010023{
10024 Py_ssize_t i, k = 0;
10025
10026 for (i = 0; i < length; i++) {
10027 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10028 Py_UCS4 mapped[3];
10029 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10030 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010031 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010032 res[k++] = mapped[j];
10033 }
10034 }
10035 return k;
10036}
10037
10038static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010039do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010040{
10041 Py_ssize_t i, k = 0;
10042 int previous_is_cased;
10043
10044 previous_is_cased = 0;
10045 for (i = 0; i < length; i++) {
10046 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10047 Py_UCS4 mapped[3];
10048 int n_res, j;
10049
10050 if (previous_is_cased)
10051 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10052 else
10053 n_res = _PyUnicode_ToTitleFull(c, mapped);
10054
10055 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010056 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010057 res[k++] = mapped[j];
10058 }
10059
10060 previous_is_cased = _PyUnicode_IsCased(c);
10061 }
10062 return k;
10063}
10064
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010065static PyObject *
10066case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010067 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010068{
10069 PyObject *res = NULL;
10070 Py_ssize_t length, newlength = 0;
10071 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010072 const void *data;
10073 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010074 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10075
Benjamin Petersoneea48462012-01-16 14:28:50 -050010076 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010077
10078 kind = PyUnicode_KIND(self);
10079 data = PyUnicode_DATA(self);
10080 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010081 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010082 PyErr_SetString(PyExc_OverflowError, "string is too long");
10083 return NULL;
10084 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010085 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010086 if (tmp == NULL)
10087 return PyErr_NoMemory();
10088 newlength = perform(kind, data, length, tmp, &maxchar);
10089 res = PyUnicode_New(newlength, maxchar);
10090 if (res == NULL)
10091 goto leave;
10092 tmpend = tmp + newlength;
10093 outdata = PyUnicode_DATA(res);
10094 outkind = PyUnicode_KIND(res);
10095 switch (outkind) {
10096 case PyUnicode_1BYTE_KIND:
10097 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10098 break;
10099 case PyUnicode_2BYTE_KIND:
10100 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10101 break;
10102 case PyUnicode_4BYTE_KIND:
10103 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10104 break;
10105 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010106 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010107 }
10108 leave:
10109 PyMem_FREE(tmp);
10110 return res;
10111}
10112
Tim Peters8ce9f162004-08-27 01:49:32 +000010113PyObject *
10114PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010116 PyObject *res;
10117 PyObject *fseq;
10118 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010119 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010120
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010121 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010122 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010123 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010124 }
10125
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010126 /* NOTE: the following code can't call back into Python code,
10127 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010128 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010129
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010130 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010131 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010132 res = _PyUnicode_JoinArray(separator, items, seqlen);
10133 Py_DECREF(fseq);
10134 return res;
10135}
10136
10137PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010138_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010139{
10140 PyObject *res = NULL; /* the result */
10141 PyObject *sep = NULL;
10142 Py_ssize_t seplen;
10143 PyObject *item;
10144 Py_ssize_t sz, i, res_offset;
10145 Py_UCS4 maxchar;
10146 Py_UCS4 item_maxchar;
10147 int use_memcpy;
10148 unsigned char *res_data = NULL, *sep_data = NULL;
10149 PyObject *last_obj;
10150 unsigned int kind = 0;
10151
Tim Peters05eba1f2004-08-27 21:32:02 +000010152 /* If empty sequence, return u"". */
10153 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010154 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010155 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010156
Tim Peters05eba1f2004-08-27 21:32:02 +000010157 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010158 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010159 if (seqlen == 1) {
10160 if (PyUnicode_CheckExact(items[0])) {
10161 res = items[0];
10162 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010163 return res;
10164 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010165 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010166 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010167 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010168 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010169 /* Set up sep and seplen */
10170 if (separator == NULL) {
10171 /* fall back to a blank space separator */
10172 sep = PyUnicode_FromOrdinal(' ');
10173 if (!sep)
10174 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010175 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010176 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010177 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010178 else {
10179 if (!PyUnicode_Check(separator)) {
10180 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010181 "separator: expected str instance,"
10182 " %.80s found",
10183 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010184 goto onError;
10185 }
10186 if (PyUnicode_READY(separator))
10187 goto onError;
10188 sep = separator;
10189 seplen = PyUnicode_GET_LENGTH(separator);
10190 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10191 /* inc refcount to keep this code path symmetric with the
10192 above case of a blank separator */
10193 Py_INCREF(sep);
10194 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010195 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010196 }
10197
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010198 /* There are at least two things to join, or else we have a subclass
10199 * of str in the sequence.
10200 * Do a pre-pass to figure out the total amount of space we'll
10201 * need (sz), and see whether all argument are strings.
10202 */
10203 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010204#ifdef Py_DEBUG
10205 use_memcpy = 0;
10206#else
10207 use_memcpy = 1;
10208#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010209 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010210 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010211 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010212 if (!PyUnicode_Check(item)) {
10213 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010214 "sequence item %zd: expected str instance,"
10215 " %.80s found",
10216 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010217 goto onError;
10218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 if (PyUnicode_READY(item) == -1)
10220 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010221 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010223 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010224 if (i != 0) {
10225 add_sz += seplen;
10226 }
10227 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010228 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010229 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010230 goto onError;
10231 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010232 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010233 if (use_memcpy && last_obj != NULL) {
10234 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10235 use_memcpy = 0;
10236 }
10237 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010238 }
Tim Petersced69f82003-09-16 20:30:58 +000010239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010241 if (res == NULL)
10242 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010243
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010244 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010245#ifdef Py_DEBUG
10246 use_memcpy = 0;
10247#else
10248 if (use_memcpy) {
10249 res_data = PyUnicode_1BYTE_DATA(res);
10250 kind = PyUnicode_KIND(res);
10251 if (seplen != 0)
10252 sep_data = PyUnicode_1BYTE_DATA(sep);
10253 }
10254#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010255 if (use_memcpy) {
10256 for (i = 0; i < seqlen; ++i) {
10257 Py_ssize_t itemlen;
10258 item = items[i];
10259
10260 /* Copy item, and maybe the separator. */
10261 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010262 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010263 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010264 kind * seplen);
10265 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010266 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010267
10268 itemlen = PyUnicode_GET_LENGTH(item);
10269 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010270 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010271 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010272 kind * itemlen);
10273 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010274 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010275 }
10276 assert(res_data == PyUnicode_1BYTE_DATA(res)
10277 + kind * PyUnicode_GET_LENGTH(res));
10278 }
10279 else {
10280 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10281 Py_ssize_t itemlen;
10282 item = items[i];
10283
10284 /* Copy item, and maybe the separator. */
10285 if (i && seplen != 0) {
10286 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10287 res_offset += seplen;
10288 }
10289
10290 itemlen = PyUnicode_GET_LENGTH(item);
10291 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010292 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010293 res_offset += itemlen;
10294 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010295 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010296 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010297 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010300 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302
Benjamin Peterson29060642009-01-31 22:14:21 +000010303 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010305 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306 return NULL;
10307}
10308
Victor Stinnerd3f08822012-05-29 12:57:52 +020010309void
10310_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10311 Py_UCS4 fill_char)
10312{
10313 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010314 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010315 assert(PyUnicode_IS_READY(unicode));
10316 assert(unicode_modifiable(unicode));
10317 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10318 assert(start >= 0);
10319 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010320 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010321}
10322
Victor Stinner3fe55312012-01-04 00:33:50 +010010323Py_ssize_t
10324PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10325 Py_UCS4 fill_char)
10326{
10327 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010328
10329 if (!PyUnicode_Check(unicode)) {
10330 PyErr_BadInternalCall();
10331 return -1;
10332 }
10333 if (PyUnicode_READY(unicode) == -1)
10334 return -1;
10335 if (unicode_check_modifiable(unicode))
10336 return -1;
10337
Victor Stinnerd3f08822012-05-29 12:57:52 +020010338 if (start < 0) {
10339 PyErr_SetString(PyExc_IndexError, "string index out of range");
10340 return -1;
10341 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010342 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10343 PyErr_SetString(PyExc_ValueError,
10344 "fill character is bigger than "
10345 "the string maximum character");
10346 return -1;
10347 }
10348
10349 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10350 length = Py_MIN(maxlen, length);
10351 if (length <= 0)
10352 return 0;
10353
Victor Stinnerd3f08822012-05-29 12:57:52 +020010354 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010355 return length;
10356}
10357
Victor Stinner9310abb2011-10-05 00:59:23 +020010358static PyObject *
10359pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010360 Py_ssize_t left,
10361 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 PyObject *u;
10365 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010366 int kind;
10367 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368
10369 if (left < 0)
10370 left = 0;
10371 if (right < 0)
10372 right = 0;
10373
Victor Stinnerc4b49542011-12-11 22:44:26 +010010374 if (left == 0 && right == 0)
10375 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10378 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010379 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10380 return NULL;
10381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010383 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010385 if (!u)
10386 return NULL;
10387
10388 kind = PyUnicode_KIND(u);
10389 data = PyUnicode_DATA(u);
10390 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010391 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010392 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010393 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010394 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010395 assert(_PyUnicode_CheckConsistency(u, 1));
10396 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397}
10398
Alexander Belopolsky40018472011-02-26 01:02:56 +000010399PyObject *
10400PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010404 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406
Benjamin Petersonead6b532011-12-20 17:23:42 -060010407 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010409 if (PyUnicode_IS_ASCII(string))
10410 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010411 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010412 PyUnicode_GET_LENGTH(string), keepends);
10413 else
10414 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010415 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010416 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 break;
10418 case PyUnicode_2BYTE_KIND:
10419 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010420 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 PyUnicode_GET_LENGTH(string), keepends);
10422 break;
10423 case PyUnicode_4BYTE_KIND:
10424 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010425 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 PyUnicode_GET_LENGTH(string), keepends);
10427 break;
10428 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010429 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432}
10433
Alexander Belopolsky40018472011-02-26 01:02:56 +000010434static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010435split(PyObject *self,
10436 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010437 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010439 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010440 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 Py_ssize_t len1, len2;
10442 PyObject* out;
10443
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010445 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 if (PyUnicode_READY(self) == -1)
10448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010451 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010453 if (PyUnicode_IS_ASCII(self))
10454 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010455 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010456 PyUnicode_GET_LENGTH(self), maxcount
10457 );
10458 else
10459 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010460 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010461 PyUnicode_GET_LENGTH(self), maxcount
10462 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 case PyUnicode_2BYTE_KIND:
10464 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010465 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 PyUnicode_GET_LENGTH(self), maxcount
10467 );
10468 case PyUnicode_4BYTE_KIND:
10469 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010470 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 PyUnicode_GET_LENGTH(self), maxcount
10472 );
10473 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010474 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 }
10476
10477 if (PyUnicode_READY(substring) == -1)
10478 return NULL;
10479
10480 kind1 = PyUnicode_KIND(self);
10481 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 len1 = PyUnicode_GET_LENGTH(self);
10483 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010484 if (kind1 < kind2 || len1 < len2) {
10485 out = PyList_New(1);
10486 if (out == NULL)
10487 return NULL;
10488 Py_INCREF(self);
10489 PyList_SET_ITEM(out, 0, self);
10490 return out;
10491 }
10492 buf1 = PyUnicode_DATA(self);
10493 buf2 = PyUnicode_DATA(substring);
10494 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010495 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010496 if (!buf2)
10497 return NULL;
10498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010500 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010502 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10503 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010504 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010505 else
10506 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010507 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 break;
10509 case PyUnicode_2BYTE_KIND:
10510 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010511 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 break;
10513 case PyUnicode_4BYTE_KIND:
10514 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010515 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 break;
10517 default:
10518 out = NULL;
10519 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010520 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010521 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010522 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524}
10525
Alexander Belopolsky40018472011-02-26 01:02:56 +000010526static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010527rsplit(PyObject *self,
10528 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010529 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010530{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010531 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010532 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 Py_ssize_t len1, len2;
10534 PyObject* out;
10535
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010536 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010537 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 if (PyUnicode_READY(self) == -1)
10540 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010543 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010545 if (PyUnicode_IS_ASCII(self))
10546 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010547 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010548 PyUnicode_GET_LENGTH(self), maxcount
10549 );
10550 else
10551 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010552 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010553 PyUnicode_GET_LENGTH(self), maxcount
10554 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 case PyUnicode_2BYTE_KIND:
10556 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010557 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 PyUnicode_GET_LENGTH(self), maxcount
10559 );
10560 case PyUnicode_4BYTE_KIND:
10561 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010562 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 PyUnicode_GET_LENGTH(self), maxcount
10564 );
10565 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010566 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 }
10568
10569 if (PyUnicode_READY(substring) == -1)
10570 return NULL;
10571
10572 kind1 = PyUnicode_KIND(self);
10573 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 len1 = PyUnicode_GET_LENGTH(self);
10575 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010576 if (kind1 < kind2 || len1 < len2) {
10577 out = PyList_New(1);
10578 if (out == NULL)
10579 return NULL;
10580 Py_INCREF(self);
10581 PyList_SET_ITEM(out, 0, self);
10582 return out;
10583 }
10584 buf1 = PyUnicode_DATA(self);
10585 buf2 = PyUnicode_DATA(substring);
10586 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010587 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010588 if (!buf2)
10589 return NULL;
10590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010592 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010594 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10595 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010596 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010597 else
10598 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010599 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 break;
10601 case PyUnicode_2BYTE_KIND:
10602 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010603 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 break;
10605 case PyUnicode_4BYTE_KIND:
10606 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010607 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 break;
10609 default:
10610 out = NULL;
10611 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010612 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010613 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010614 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 return out;
10616}
10617
10618static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010619anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10620 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010622 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010624 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10625 return asciilib_find(buf1, len1, buf2, len2, offset);
10626 else
10627 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 case PyUnicode_2BYTE_KIND:
10629 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10630 case PyUnicode_4BYTE_KIND:
10631 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10632 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010633 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634}
10635
10636static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010637anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10638 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010640 switch (kind) {
10641 case PyUnicode_1BYTE_KIND:
10642 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10643 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10644 else
10645 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10646 case PyUnicode_2BYTE_KIND:
10647 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10648 case PyUnicode_4BYTE_KIND:
10649 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10650 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010651 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010652}
10653
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010654static void
10655replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10656 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10657{
10658 int kind = PyUnicode_KIND(u);
10659 void *data = PyUnicode_DATA(u);
10660 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10661 if (kind == PyUnicode_1BYTE_KIND) {
10662 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10663 (Py_UCS1 *)data + len,
10664 u1, u2, maxcount);
10665 }
10666 else if (kind == PyUnicode_2BYTE_KIND) {
10667 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10668 (Py_UCS2 *)data + len,
10669 u1, u2, maxcount);
10670 }
10671 else {
10672 assert(kind == PyUnicode_4BYTE_KIND);
10673 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10674 (Py_UCS4 *)data + len,
10675 u1, u2, maxcount);
10676 }
10677}
10678
Alexander Belopolsky40018472011-02-26 01:02:56 +000010679static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680replace(PyObject *self, PyObject *str1,
10681 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010684 const char *sbuf = PyUnicode_DATA(self);
10685 const void *buf1 = PyUnicode_DATA(str1);
10686 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 int srelease = 0, release1 = 0, release2 = 0;
10688 int skind = PyUnicode_KIND(self);
10689 int kind1 = PyUnicode_KIND(str1);
10690 int kind2 = PyUnicode_KIND(str2);
10691 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10692 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10693 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010694 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010695 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010697 if (slen < len1)
10698 goto nothing;
10699
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010701 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010702 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010703 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704
Victor Stinner59de0ee2011-10-07 10:01:28 +020010705 if (str1 == str2)
10706 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707
Victor Stinner49a0a212011-10-12 23:46:10 +020010708 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010709 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10710 if (maxchar < maxchar_str1)
10711 /* substring too wide to be present */
10712 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010713 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10714 /* Replacing str1 with str2 may cause a maxchar reduction in the
10715 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010716 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010717 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010720 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010722 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010725 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010726 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010727
Victor Stinner69ed0f42013-04-09 21:48:24 +020010728 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010729 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010730 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010731 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010732 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010734 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010736
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010737 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10738 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010739 }
10740 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 int rkind = skind;
10742 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010743 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 if (kind1 < rkind) {
10746 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010747 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 if (!buf1) goto error;
10749 release1 = 1;
10750 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010751 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010752 if (i < 0)
10753 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 if (rkind > kind2) {
10755 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010756 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 if (!buf2) goto error;
10758 release2 = 1;
10759 }
10760 else if (rkind < kind2) {
10761 /* widen self and buf1 */
10762 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010763 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010764 assert(buf1 != PyUnicode_DATA(str1));
10765 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010766 buf1 = PyUnicode_DATA(str1);
10767 release1 = 0;
10768 }
10769 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 if (!sbuf) goto error;
10771 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010772 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 if (!buf1) goto error;
10774 release1 = 1;
10775 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010776 u = PyUnicode_New(slen, maxchar);
10777 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010779 assert(PyUnicode_KIND(u) == rkind);
10780 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010781
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010782 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010783 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010784 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010786 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010788
10789 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010790 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010791 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010792 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010793 if (i == -1)
10794 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010795 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010797 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010801 }
10802 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010804 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 int rkind = skind;
10806 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010809 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010810 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 if (!buf1) goto error;
10812 release1 = 1;
10813 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010814 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010815 if (n == 0)
10816 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010818 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010819 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 if (!buf2) goto error;
10821 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010824 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010826 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 if (!sbuf) goto error;
10828 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010829 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010830 assert(buf1 != PyUnicode_DATA(str1));
10831 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010832 buf1 = PyUnicode_DATA(str1);
10833 release1 = 0;
10834 }
10835 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 if (!buf1) goto error;
10837 release1 = 1;
10838 }
10839 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10840 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010841 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 PyErr_SetString(PyExc_OverflowError,
10843 "replace string is too long");
10844 goto error;
10845 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010846 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010847 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010848 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010849 goto done;
10850 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010851 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 PyErr_SetString(PyExc_OverflowError,
10853 "replace string is too long");
10854 goto error;
10855 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010856 u = PyUnicode_New(new_size, maxchar);
10857 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010859 assert(PyUnicode_KIND(u) == rkind);
10860 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 ires = i = 0;
10862 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010863 while (n-- > 0) {
10864 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010865 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010866 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010867 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010868 if (j == -1)
10869 break;
10870 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010871 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010872 memcpy(res + rkind * ires,
10873 sbuf + rkind * i,
10874 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010876 }
10877 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010879 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010881 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010882 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010883 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010887 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010888 memcpy(res + rkind * ires,
10889 sbuf + rkind * i,
10890 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010891 }
10892 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010893 /* interleave */
10894 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010895 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010897 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010899 if (--n <= 0)
10900 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010901 memcpy(res + rkind * ires,
10902 sbuf + rkind * i,
10903 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 ires++;
10905 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010906 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010907 memcpy(res + rkind * ires,
10908 sbuf + rkind * i,
10909 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010910 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010911 }
10912
10913 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010914 unicode_adjust_maxchar(&u);
10915 if (u == NULL)
10916 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010918
10919 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010920 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10921 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10922 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010924 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010926 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010928 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010929 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010931
Benjamin Peterson29060642009-01-31 22:14:21 +000010932 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010933 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010934 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10935 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10936 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010938 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010940 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010942 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010943 return unicode_result_unchanged(self);
10944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010946 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10947 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10948 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10949 if (srelease)
10950 PyMem_FREE((void *)sbuf);
10951 if (release1)
10952 PyMem_FREE((void *)buf1);
10953 if (release2)
10954 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956}
10957
10958/* --- Unicode Object Methods --------------------------------------------- */
10959
INADA Naoki3ae20562017-01-16 20:41:20 +090010960/*[clinic input]
10961str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
INADA Naoki3ae20562017-01-16 20:41:20 +090010963Return a version of the string where each word is titlecased.
10964
10965More specifically, words start with uppercased characters and all remaining
10966cased characters have lower case.
10967[clinic start generated code]*/
10968
10969static PyObject *
10970unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010971/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010973 if (PyUnicode_READY(self) == -1)
10974 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010975 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976}
10977
INADA Naoki3ae20562017-01-16 20:41:20 +090010978/*[clinic input]
10979str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980
INADA Naoki3ae20562017-01-16 20:41:20 +090010981Return a capitalized version of the string.
10982
10983More specifically, make the first character have upper case and the rest lower
10984case.
10985[clinic start generated code]*/
10986
10987static PyObject *
10988unicode_capitalize_impl(PyObject *self)
10989/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010991 if (PyUnicode_READY(self) == -1)
10992 return NULL;
10993 if (PyUnicode_GET_LENGTH(self) == 0)
10994 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010995 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996}
10997
INADA Naoki3ae20562017-01-16 20:41:20 +090010998/*[clinic input]
10999str.casefold as unicode_casefold
11000
11001Return a version of the string suitable for caseless comparisons.
11002[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011003
11004static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011005unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011006/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011007{
11008 if (PyUnicode_READY(self) == -1)
11009 return NULL;
11010 if (PyUnicode_IS_ASCII(self))
11011 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011012 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011013}
11014
11015
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011016/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011017
11018static int
11019convert_uc(PyObject *obj, void *addr)
11020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011022
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011023 if (!PyUnicode_Check(obj)) {
11024 PyErr_Format(PyExc_TypeError,
11025 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011026 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011027 return 0;
11028 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011029 if (PyUnicode_READY(obj) < 0)
11030 return 0;
11031 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011032 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011033 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011034 return 0;
11035 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011036 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011037 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011038}
11039
INADA Naoki3ae20562017-01-16 20:41:20 +090011040/*[clinic input]
11041str.center as unicode_center
11042
11043 width: Py_ssize_t
11044 fillchar: Py_UCS4 = ' '
11045 /
11046
11047Return a centered string of length width.
11048
11049Padding is done using the specified fill character (default is a space).
11050[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051
11052static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011053unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11054/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011056 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057
Benjamin Petersonbac79492012-01-14 13:34:47 -050011058 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059 return NULL;
11060
Victor Stinnerc4b49542011-12-11 22:44:26 +010011061 if (PyUnicode_GET_LENGTH(self) >= width)
11062 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063
Victor Stinnerc4b49542011-12-11 22:44:26 +010011064 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065 left = marg / 2 + (marg & width & 1);
11066
Victor Stinner9310abb2011-10-05 00:59:23 +020011067 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068}
11069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070/* This function assumes that str1 and str2 are readied by the caller. */
11071
Marc-André Lemburge5034372000-08-08 08:04:29 +000011072static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011073unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011074{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011075#define COMPARE(TYPE1, TYPE2) \
11076 do { \
11077 TYPE1* p1 = (TYPE1 *)data1; \
11078 TYPE2* p2 = (TYPE2 *)data2; \
11079 TYPE1* end = p1 + len; \
11080 Py_UCS4 c1, c2; \
11081 for (; p1 != end; p1++, p2++) { \
11082 c1 = *p1; \
11083 c2 = *p2; \
11084 if (c1 != c2) \
11085 return (c1 < c2) ? -1 : 1; \
11086 } \
11087 } \
11088 while (0)
11089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011091 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011092 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 kind1 = PyUnicode_KIND(str1);
11095 kind2 = PyUnicode_KIND(str2);
11096 data1 = PyUnicode_DATA(str1);
11097 data2 = PyUnicode_DATA(str2);
11098 len1 = PyUnicode_GET_LENGTH(str1);
11099 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011100 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011101
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011102 switch(kind1) {
11103 case PyUnicode_1BYTE_KIND:
11104 {
11105 switch(kind2) {
11106 case PyUnicode_1BYTE_KIND:
11107 {
11108 int cmp = memcmp(data1, data2, len);
11109 /* normalize result of memcmp() into the range [-1; 1] */
11110 if (cmp < 0)
11111 return -1;
11112 if (cmp > 0)
11113 return 1;
11114 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011115 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011116 case PyUnicode_2BYTE_KIND:
11117 COMPARE(Py_UCS1, Py_UCS2);
11118 break;
11119 case PyUnicode_4BYTE_KIND:
11120 COMPARE(Py_UCS1, Py_UCS4);
11121 break;
11122 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011123 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011124 }
11125 break;
11126 }
11127 case PyUnicode_2BYTE_KIND:
11128 {
11129 switch(kind2) {
11130 case PyUnicode_1BYTE_KIND:
11131 COMPARE(Py_UCS2, Py_UCS1);
11132 break;
11133 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011134 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011135 COMPARE(Py_UCS2, Py_UCS2);
11136 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011137 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011138 case PyUnicode_4BYTE_KIND:
11139 COMPARE(Py_UCS2, Py_UCS4);
11140 break;
11141 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011142 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011143 }
11144 break;
11145 }
11146 case PyUnicode_4BYTE_KIND:
11147 {
11148 switch(kind2) {
11149 case PyUnicode_1BYTE_KIND:
11150 COMPARE(Py_UCS4, Py_UCS1);
11151 break;
11152 case PyUnicode_2BYTE_KIND:
11153 COMPARE(Py_UCS4, Py_UCS2);
11154 break;
11155 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011156 {
11157#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11158 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11159 /* normalize result of wmemcmp() into the range [-1; 1] */
11160 if (cmp < 0)
11161 return -1;
11162 if (cmp > 0)
11163 return 1;
11164#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011165 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011166#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011167 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011168 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011169 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011170 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011171 }
11172 break;
11173 }
11174 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011175 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011176 }
11177
Victor Stinner770e19e2012-10-04 22:59:45 +020011178 if (len1 == len2)
11179 return 0;
11180 if (len1 < len2)
11181 return -1;
11182 else
11183 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011184
11185#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011186}
11187
Benjamin Peterson621b4302016-09-09 13:54:34 -070011188static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011189unicode_compare_eq(PyObject *str1, PyObject *str2)
11190{
11191 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011192 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011193 Py_ssize_t len;
11194 int cmp;
11195
Victor Stinnere5567ad2012-10-23 02:48:49 +020011196 len = PyUnicode_GET_LENGTH(str1);
11197 if (PyUnicode_GET_LENGTH(str2) != len)
11198 return 0;
11199 kind = PyUnicode_KIND(str1);
11200 if (PyUnicode_KIND(str2) != kind)
11201 return 0;
11202 data1 = PyUnicode_DATA(str1);
11203 data2 = PyUnicode_DATA(str2);
11204
11205 cmp = memcmp(data1, data2, len * kind);
11206 return (cmp == 0);
11207}
11208
11209
Alexander Belopolsky40018472011-02-26 01:02:56 +000011210int
11211PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11214 if (PyUnicode_READY(left) == -1 ||
11215 PyUnicode_READY(right) == -1)
11216 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011217
11218 /* a string is equal to itself */
11219 if (left == right)
11220 return 0;
11221
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011222 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011223 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011224 PyErr_Format(PyExc_TypeError,
11225 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011226 Py_TYPE(left)->tp_name,
11227 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 return -1;
11229}
11230
Martin v. Löwis5b222132007-06-10 09:51:05 +000011231int
11232PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 Py_ssize_t i;
11235 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011237 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238
Victor Stinner910337b2011-10-03 03:20:16 +020011239 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011240 if (!PyUnicode_IS_READY(uni)) {
11241 const wchar_t *ws = _PyUnicode_WSTR(uni);
11242 /* Compare Unicode string and source character set string */
11243 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11244 if (chr != ustr[i])
11245 return (chr < ustr[i]) ? -1 : 1;
11246 }
11247 /* This check keeps Python strings that end in '\0' from comparing equal
11248 to C strings identical up to that point. */
11249 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11250 return 1; /* uni is longer */
11251 if (ustr[i])
11252 return -1; /* str is longer */
11253 return 0;
11254 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011256 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011257 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011258 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011259 size_t len, len2 = strlen(str);
11260 int cmp;
11261
11262 len = Py_MIN(len1, len2);
11263 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011264 if (cmp != 0) {
11265 if (cmp < 0)
11266 return -1;
11267 else
11268 return 1;
11269 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011270 if (len1 > len2)
11271 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011272 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011273 return -1; /* str is longer */
11274 return 0;
11275 }
11276 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011277 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011278 /* Compare Unicode string and source character set string */
11279 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011280 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011281 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11282 /* This check keeps Python strings that end in '\0' from comparing equal
11283 to C strings identical up to that point. */
11284 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11285 return 1; /* uni is longer */
11286 if (str[i])
11287 return -1; /* str is longer */
11288 return 0;
11289 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011290}
11291
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011292static int
11293non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11294{
11295 size_t i, len;
11296 const wchar_t *p;
11297 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11298 if (strlen(str) != len)
11299 return 0;
11300 p = _PyUnicode_WSTR(unicode);
11301 assert(p);
11302 for (i = 0; i < len; i++) {
11303 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011304 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011305 return 0;
11306 }
11307 return 1;
11308}
11309
11310int
11311_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11312{
11313 size_t len;
11314 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011315 assert(str);
11316#ifndef NDEBUG
11317 for (const char *p = str; *p; p++) {
11318 assert((unsigned char)*p < 128);
11319 }
11320#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011321 if (PyUnicode_READY(unicode) == -1) {
11322 /* Memory error or bad data */
11323 PyErr_Clear();
11324 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11325 }
11326 if (!PyUnicode_IS_ASCII(unicode))
11327 return 0;
11328 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11329 return strlen(str) == len &&
11330 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11331}
11332
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011333int
11334_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11335{
11336 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011337
11338 assert(_PyUnicode_CHECK(left));
11339 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011340#ifndef NDEBUG
11341 for (const char *p = right->string; *p; p++) {
11342 assert((unsigned char)*p < 128);
11343 }
11344#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011345
11346 if (PyUnicode_READY(left) == -1) {
11347 /* memory error or bad data */
11348 PyErr_Clear();
11349 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11350 }
11351
11352 if (!PyUnicode_IS_ASCII(left))
11353 return 0;
11354
11355 right_uni = _PyUnicode_FromId(right); /* borrowed */
11356 if (right_uni == NULL) {
11357 /* memory error or bad data */
11358 PyErr_Clear();
11359 return _PyUnicode_EqualToASCIIString(left, right->string);
11360 }
11361
11362 if (left == right_uni)
11363 return 1;
11364
11365 if (PyUnicode_CHECK_INTERNED(left))
11366 return 0;
11367
Victor Stinner607b1022020-05-05 18:50:30 +020011368#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011369 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011370 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011371 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11372 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011373#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011374
11375 return unicode_compare_eq(left, right_uni);
11376}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011377
Alexander Belopolsky40018472011-02-26 01:02:56 +000011378PyObject *
11379PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011380{
11381 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011382
Victor Stinnere5567ad2012-10-23 02:48:49 +020011383 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11384 Py_RETURN_NOTIMPLEMENTED;
11385
11386 if (PyUnicode_READY(left) == -1 ||
11387 PyUnicode_READY(right) == -1)
11388 return NULL;
11389
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011390 if (left == right) {
11391 switch (op) {
11392 case Py_EQ:
11393 case Py_LE:
11394 case Py_GE:
11395 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011396 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011397 case Py_NE:
11398 case Py_LT:
11399 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011400 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011401 default:
11402 PyErr_BadArgument();
11403 return NULL;
11404 }
11405 }
11406 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011407 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011408 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011409 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011410 }
11411 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011412 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011413 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011414 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011415}
11416
Alexander Belopolsky40018472011-02-26 01:02:56 +000011417int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011418_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11419{
11420 return unicode_eq(aa, bb);
11421}
11422
11423int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011424PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011425{
Victor Stinner77282cb2013-04-14 19:22:47 +020011426 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011427 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011429 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011430
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011431 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011432 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011433 "'in <string>' requires string as left operand, not %.100s",
11434 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011435 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011436 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011437 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011438 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011439 if (ensure_unicode(str) < 0)
11440 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011443 kind2 = PyUnicode_KIND(substr);
11444 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011445 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011447 len2 = PyUnicode_GET_LENGTH(substr);
11448 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011449 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011450 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011451 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011452 if (len2 == 1) {
11453 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11454 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011455 return result;
11456 }
11457 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011458 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011459 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011460 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462
Victor Stinner77282cb2013-04-14 19:22:47 +020011463 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 case PyUnicode_1BYTE_KIND:
11465 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11466 break;
11467 case PyUnicode_2BYTE_KIND:
11468 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11469 break;
11470 case PyUnicode_4BYTE_KIND:
11471 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11472 break;
11473 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011474 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011476
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011477 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011478 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011479 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480
Guido van Rossum403d68b2000-03-13 15:55:09 +000011481 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011482}
11483
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484/* Concat to string or Unicode object giving a new Unicode object. */
11485
Alexander Belopolsky40018472011-02-26 01:02:56 +000011486PyObject *
11487PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011489 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011490 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011491 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011493 if (ensure_unicode(left) < 0)
11494 return NULL;
11495
11496 if (!PyUnicode_Check(right)) {
11497 PyErr_Format(PyExc_TypeError,
11498 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011499 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011500 return NULL;
11501 }
11502 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
11505 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011506 PyObject *empty = unicode_get_empty(); // Borrowed reference
11507 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011508 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011509 }
11510 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011511 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011514 left_len = PyUnicode_GET_LENGTH(left);
11515 right_len = PyUnicode_GET_LENGTH(right);
11516 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011517 PyErr_SetString(PyExc_OverflowError,
11518 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011519 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011520 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011521 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011522
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011523 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11524 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011525 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011528 result = PyUnicode_New(new_len, maxchar);
11529 if (result == NULL)
11530 return NULL;
11531 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11532 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11533 assert(_PyUnicode_CheckConsistency(result, 1));
11534 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535}
11536
Walter Dörwald1ab83302007-05-18 17:15:44 +000011537void
Victor Stinner23e56682011-10-03 03:54:37 +020011538PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011539{
Victor Stinner23e56682011-10-03 03:54:37 +020011540 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011541 Py_UCS4 maxchar, maxchar2;
11542 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011543
11544 if (p_left == NULL) {
11545 if (!PyErr_Occurred())
11546 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011547 return;
11548 }
Victor Stinner23e56682011-10-03 03:54:37 +020011549 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011550 if (right == NULL || left == NULL
11551 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011552 if (!PyErr_Occurred())
11553 PyErr_BadInternalCall();
11554 goto error;
11555 }
11556
Benjamin Petersonbac79492012-01-14 13:34:47 -050011557 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011558 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011559 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011560 goto error;
11561
Victor Stinner488fa492011-12-12 00:01:39 +010011562 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011563 PyObject *empty = unicode_get_empty(); // Borrowed reference
11564 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011565 Py_DECREF(left);
11566 Py_INCREF(right);
11567 *p_left = right;
11568 return;
11569 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011570 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011571 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011572 }
Victor Stinner488fa492011-12-12 00:01:39 +010011573
11574 left_len = PyUnicode_GET_LENGTH(left);
11575 right_len = PyUnicode_GET_LENGTH(right);
11576 if (left_len > PY_SSIZE_T_MAX - right_len) {
11577 PyErr_SetString(PyExc_OverflowError,
11578 "strings are too large to concat");
11579 goto error;
11580 }
11581 new_len = left_len + right_len;
11582
11583 if (unicode_modifiable(left)
11584 && PyUnicode_CheckExact(right)
11585 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011586 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11587 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011588 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011589 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011590 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11591 {
11592 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011593 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011594 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011595
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011596 /* copy 'right' into the newly allocated area of 'left' */
11597 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011598 }
Victor Stinner488fa492011-12-12 00:01:39 +010011599 else {
11600 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11601 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011602 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011603
Victor Stinner488fa492011-12-12 00:01:39 +010011604 /* Concat the two Unicode strings */
11605 res = PyUnicode_New(new_len, maxchar);
11606 if (res == NULL)
11607 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011608 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11609 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011610 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011611 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011612 }
11613 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011614 return;
11615
11616error:
Victor Stinner488fa492011-12-12 00:01:39 +010011617 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011618}
11619
11620void
11621PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11622{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011623 PyUnicode_Append(pleft, right);
11624 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011625}
11626
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011627/*
11628Wraps stringlib_parse_args_finds() and additionally ensures that the
11629first argument is a unicode object.
11630*/
11631
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011632static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011633parse_args_finds_unicode(const char * function_name, PyObject *args,
11634 PyObject **substring,
11635 Py_ssize_t *start, Py_ssize_t *end)
11636{
11637 if(stringlib_parse_args_finds(function_name, args, substring,
11638 start, end)) {
11639 if (ensure_unicode(*substring) < 0)
11640 return 0;
11641 return 1;
11642 }
11643 return 0;
11644}
11645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011646PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011647 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011649Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011650string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011651interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652
11653static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011654unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011656 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011657 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011658 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011660 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011661 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011664 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011665 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 kind1 = PyUnicode_KIND(self);
11668 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011669 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011670 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 len1 = PyUnicode_GET_LENGTH(self);
11673 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011675 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011676 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011677
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011678 buf1 = PyUnicode_DATA(self);
11679 buf2 = PyUnicode_DATA(substring);
11680 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011681 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011682 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011683 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011684 }
11685 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 case PyUnicode_1BYTE_KIND:
11687 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011688 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 buf2, len2, PY_SSIZE_T_MAX
11690 );
11691 break;
11692 case PyUnicode_2BYTE_KIND:
11693 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011694 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 buf2, len2, PY_SSIZE_T_MAX
11696 );
11697 break;
11698 case PyUnicode_4BYTE_KIND:
11699 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011700 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 buf2, len2, PY_SSIZE_T_MAX
11702 );
11703 break;
11704 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011705 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 }
11707
11708 result = PyLong_FromSsize_t(iresult);
11709
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011710 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011711 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011712 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714 return result;
11715}
11716
INADA Naoki3ae20562017-01-16 20:41:20 +090011717/*[clinic input]
11718str.encode as unicode_encode
11719
11720 encoding: str(c_default="NULL") = 'utf-8'
11721 The encoding in which to encode the string.
11722 errors: str(c_default="NULL") = 'strict'
11723 The error handling scheme to use for encoding errors.
11724 The default is 'strict' meaning that encoding errors raise a
11725 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11726 'xmlcharrefreplace' as well as any other name registered with
11727 codecs.register_error that can handle UnicodeEncodeErrors.
11728
11729Encode the string using the codec registered for encoding.
11730[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
11732static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011733unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011734/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011736 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011737}
11738
INADA Naoki3ae20562017-01-16 20:41:20 +090011739/*[clinic input]
11740str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741
INADA Naoki3ae20562017-01-16 20:41:20 +090011742 tabsize: int = 8
11743
11744Return a copy where all tab characters are expanded using spaces.
11745
11746If tabsize is not given, a tab size of 8 characters is assumed.
11747[clinic start generated code]*/
11748
11749static PyObject *
11750unicode_expandtabs_impl(PyObject *self, int tabsize)
11751/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011753 Py_ssize_t i, j, line_pos, src_len, incr;
11754 Py_UCS4 ch;
11755 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011756 const void *src_data;
11757 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011758 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011759 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760
Antoine Pitrou22425222011-10-04 19:10:51 +020011761 if (PyUnicode_READY(self) == -1)
11762 return NULL;
11763
Thomas Wouters7e474022000-07-16 12:04:32 +000011764 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011765 src_len = PyUnicode_GET_LENGTH(self);
11766 i = j = line_pos = 0;
11767 kind = PyUnicode_KIND(self);
11768 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011769 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011770 for (; i < src_len; i++) {
11771 ch = PyUnicode_READ(kind, src_data, i);
11772 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011773 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011775 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011777 goto overflow;
11778 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011779 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011780 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011784 goto overflow;
11785 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011787 if (ch == '\n' || ch == '\r')
11788 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011790 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011791 if (!found)
11792 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011793
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011795 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796 if (!u)
11797 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011798 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
Antoine Pitroue71d5742011-10-04 15:55:09 +020011800 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801
Antoine Pitroue71d5742011-10-04 15:55:09 +020011802 for (; i < src_len; i++) {
11803 ch = PyUnicode_READ(kind, src_data, i);
11804 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011806 incr = tabsize - (line_pos % tabsize);
11807 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011808 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011809 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011811 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011813 line_pos++;
11814 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011815 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011816 if (ch == '\n' || ch == '\r')
11817 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011819 }
11820 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011821 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011822
Antoine Pitroue71d5742011-10-04 15:55:09 +020011823 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011824 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826}
11827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011828PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830\n\
11831Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011832such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833arguments start and end are interpreted as in slice notation.\n\
11834\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011835Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836
11837static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011840 /* initialize variables to prevent gcc warning */
11841 PyObject *substring = NULL;
11842 Py_ssize_t start = 0;
11843 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011844 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011846 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011849 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011852 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 if (result == -2)
11855 return NULL;
11856
Christian Heimes217cfd12007-12-02 14:31:20 +000011857 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858}
11859
11860static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011861unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011863 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011864 enum PyUnicode_Kind kind;
11865 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011866
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011867 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011868 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011870 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011871 if (PyUnicode_READY(self) == -1) {
11872 return NULL;
11873 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011874 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11875 PyErr_SetString(PyExc_IndexError, "string index out of range");
11876 return NULL;
11877 }
11878 kind = PyUnicode_KIND(self);
11879 data = PyUnicode_DATA(self);
11880 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011881 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882}
11883
Guido van Rossumc2504932007-09-18 19:42:40 +000011884/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011885 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011886static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011887unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011889 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011890
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011891#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011892 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011893#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 if (_PyUnicode_HASH(self) != -1)
11895 return _PyUnicode_HASH(self);
11896 if (PyUnicode_READY(self) == -1)
11897 return -1;
animalizea1d14252019-01-02 20:16:06 +080011898
Christian Heimes985ecdc2013-11-20 11:46:18 +010011899 x = _Py_HashBytes(PyUnicode_DATA(self),
11900 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011902 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903}
11904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011905PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907\n\
oldkaa0735f2018-02-02 16:52:55 +080011908Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011909such that sub is contained within S[start:end]. Optional\n\
11910arguments start and end are interpreted as in slice notation.\n\
11911\n\
11912Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913
11914static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011917 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011918 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011919 PyObject *substring = NULL;
11920 Py_ssize_t start = 0;
11921 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011923 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011926 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011929 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 if (result == -2)
11932 return NULL;
11933
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 if (result < 0) {
11935 PyErr_SetString(PyExc_ValueError, "substring not found");
11936 return NULL;
11937 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011938
Christian Heimes217cfd12007-12-02 14:31:20 +000011939 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940}
11941
INADA Naoki3ae20562017-01-16 20:41:20 +090011942/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011943str.isascii as unicode_isascii
11944
11945Return True if all characters in the string are ASCII, False otherwise.
11946
11947ASCII characters have code points in the range U+0000-U+007F.
11948Empty string is ASCII too.
11949[clinic start generated code]*/
11950
11951static PyObject *
11952unicode_isascii_impl(PyObject *self)
11953/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11954{
11955 if (PyUnicode_READY(self) == -1) {
11956 return NULL;
11957 }
11958 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11959}
11960
11961/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011962str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963
INADA Naoki3ae20562017-01-16 20:41:20 +090011964Return True if the string is a lowercase string, False otherwise.
11965
11966A string is lowercase if all cased characters in the string are lowercase and
11967there is at least one cased character in the string.
11968[clinic start generated code]*/
11969
11970static PyObject *
11971unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011972/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 Py_ssize_t i, length;
11975 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011976 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977 int cased;
11978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 if (PyUnicode_READY(self) == -1)
11980 return NULL;
11981 length = PyUnicode_GET_LENGTH(self);
11982 kind = PyUnicode_KIND(self);
11983 data = PyUnicode_DATA(self);
11984
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 if (length == 1)
11987 return PyBool_FromLong(
11988 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011990 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011992 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011993
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 for (i = 0; i < length; i++) {
11996 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011997
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011999 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 else if (!cased && Py_UNICODE_ISLOWER(ch))
12001 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012003 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004}
12005
INADA Naoki3ae20562017-01-16 20:41:20 +090012006/*[clinic input]
12007str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
INADA Naoki3ae20562017-01-16 20:41:20 +090012009Return True if the string is an uppercase string, False otherwise.
12010
12011A string is uppercase if all cased characters in the string are uppercase and
12012there is at least one cased character in the string.
12013[clinic start generated code]*/
12014
12015static PyObject *
12016unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012017/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 Py_ssize_t i, length;
12020 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012021 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022 int cased;
12023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 if (PyUnicode_READY(self) == -1)
12025 return NULL;
12026 length = PyUnicode_GET_LENGTH(self);
12027 kind = PyUnicode_KIND(self);
12028 data = PyUnicode_DATA(self);
12029
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 if (length == 1)
12032 return PyBool_FromLong(
12033 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012035 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012037 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012038
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 for (i = 0; i < length; i++) {
12041 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012042
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012044 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 else if (!cased && Py_UNICODE_ISUPPER(ch))
12046 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012048 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049}
12050
INADA Naoki3ae20562017-01-16 20:41:20 +090012051/*[clinic input]
12052str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053
INADA Naoki3ae20562017-01-16 20:41:20 +090012054Return True if the string is a title-cased string, False otherwise.
12055
12056In a title-cased string, upper- and title-case characters may only
12057follow uncased characters and lowercase characters only cased ones.
12058[clinic start generated code]*/
12059
12060static PyObject *
12061unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012062/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 Py_ssize_t i, length;
12065 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012066 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067 int cased, previous_is_cased;
12068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (PyUnicode_READY(self) == -1)
12070 return NULL;
12071 length = PyUnicode_GET_LENGTH(self);
12072 kind = PyUnicode_KIND(self);
12073 data = PyUnicode_DATA(self);
12074
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 if (length == 1) {
12077 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12078 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12079 (Py_UNICODE_ISUPPER(ch) != 0));
12080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012082 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012084 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012085
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086 cased = 0;
12087 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 for (i = 0; i < length; i++) {
12089 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012090
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12092 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012093 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012094 previous_is_cased = 1;
12095 cased = 1;
12096 }
12097 else if (Py_UNICODE_ISLOWER(ch)) {
12098 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012099 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 previous_is_cased = 1;
12101 cased = 1;
12102 }
12103 else
12104 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012106 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107}
12108
INADA Naoki3ae20562017-01-16 20:41:20 +090012109/*[clinic input]
12110str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111
INADA Naoki3ae20562017-01-16 20:41:20 +090012112Return True if the string is a whitespace string, False otherwise.
12113
12114A string is whitespace if all characters in the string are whitespace and there
12115is at least one character in the string.
12116[clinic start generated code]*/
12117
12118static PyObject *
12119unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012120/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 Py_ssize_t i, length;
12123 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012124 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125
12126 if (PyUnicode_READY(self) == -1)
12127 return NULL;
12128 length = PyUnicode_GET_LENGTH(self);
12129 kind = PyUnicode_KIND(self);
12130 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 if (length == 1)
12134 return PyBool_FromLong(
12135 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012137 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012139 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 for (i = 0; i < length; i++) {
12142 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012143 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012144 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012146 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147}
12148
INADA Naoki3ae20562017-01-16 20:41:20 +090012149/*[clinic input]
12150str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012151
INADA Naoki3ae20562017-01-16 20:41:20 +090012152Return True if the string is an alphabetic string, False otherwise.
12153
12154A string is alphabetic if all characters in the string are alphabetic and there
12155is at least one character in the string.
12156[clinic start generated code]*/
12157
12158static PyObject *
12159unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012160/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 Py_ssize_t i, length;
12163 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012164 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165
12166 if (PyUnicode_READY(self) == -1)
12167 return NULL;
12168 length = PyUnicode_GET_LENGTH(self);
12169 kind = PyUnicode_KIND(self);
12170 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012171
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012172 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 if (length == 1)
12174 return PyBool_FromLong(
12175 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012176
12177 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012179 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 for (i = 0; i < length; i++) {
12182 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012183 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012184 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012185 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012186}
12187
INADA Naoki3ae20562017-01-16 20:41:20 +090012188/*[clinic input]
12189str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012190
INADA Naoki3ae20562017-01-16 20:41:20 +090012191Return True if the string is an alpha-numeric string, False otherwise.
12192
12193A string is alpha-numeric if all characters in the string are alpha-numeric and
12194there is at least one character in the string.
12195[clinic start generated code]*/
12196
12197static PyObject *
12198unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012199/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012200{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012202 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 Py_ssize_t len, i;
12204
12205 if (PyUnicode_READY(self) == -1)
12206 return NULL;
12207
12208 kind = PyUnicode_KIND(self);
12209 data = PyUnicode_DATA(self);
12210 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012211
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012212 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 if (len == 1) {
12214 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12215 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12216 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012217
12218 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012220 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 for (i = 0; i < len; i++) {
12223 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012224 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012225 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012226 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012227 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012228}
12229
INADA Naoki3ae20562017-01-16 20:41:20 +090012230/*[clinic input]
12231str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232
INADA Naoki3ae20562017-01-16 20:41:20 +090012233Return True if the string is a decimal string, False otherwise.
12234
12235A string is a decimal string if all characters in the string are decimal and
12236there is at least one character in the string.
12237[clinic start generated code]*/
12238
12239static PyObject *
12240unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012241/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 Py_ssize_t i, length;
12244 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012245 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246
12247 if (PyUnicode_READY(self) == -1)
12248 return NULL;
12249 length = PyUnicode_GET_LENGTH(self);
12250 kind = PyUnicode_KIND(self);
12251 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 if (length == 1)
12255 return PyBool_FromLong(
12256 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012258 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012260 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 for (i = 0; i < length; i++) {
12263 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012264 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012266 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267}
12268
INADA Naoki3ae20562017-01-16 20:41:20 +090012269/*[clinic input]
12270str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271
INADA Naoki3ae20562017-01-16 20:41:20 +090012272Return True if the string is a digit string, False otherwise.
12273
12274A string is a digit string if all characters in the string are digits and there
12275is at least one character in the string.
12276[clinic start generated code]*/
12277
12278static PyObject *
12279unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012280/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 Py_ssize_t i, length;
12283 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012284 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285
12286 if (PyUnicode_READY(self) == -1)
12287 return NULL;
12288 length = PyUnicode_GET_LENGTH(self);
12289 kind = PyUnicode_KIND(self);
12290 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 if (length == 1) {
12294 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12295 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012298 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012300 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 for (i = 0; i < length; i++) {
12303 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012304 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012306 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307}
12308
INADA Naoki3ae20562017-01-16 20:41:20 +090012309/*[clinic input]
12310str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311
INADA Naoki3ae20562017-01-16 20:41:20 +090012312Return True if the string is a numeric string, False otherwise.
12313
12314A string is numeric if all characters in the string are numeric and there is at
12315least one character in the string.
12316[clinic start generated code]*/
12317
12318static PyObject *
12319unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012320/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 Py_ssize_t i, length;
12323 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012324 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325
12326 if (PyUnicode_READY(self) == -1)
12327 return NULL;
12328 length = PyUnicode_GET_LENGTH(self);
12329 kind = PyUnicode_KIND(self);
12330 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 if (length == 1)
12334 return PyBool_FromLong(
12335 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012337 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012339 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 for (i = 0; i < length; i++) {
12342 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012343 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012345 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346}
12347
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012348Py_ssize_t
12349_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012350{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012352 if (PyUnicode_READY(self) == -1)
12353 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012354
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012355 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012356 if (len == 0) {
12357 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 }
12360
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012361 int kind = PyUnicode_KIND(self);
12362 const void *data = PyUnicode_DATA(self);
12363 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012364 /* PEP 3131 says that the first character must be in
12365 XID_Start and subsequent characters in XID_Continue,
12366 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012367 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012368 letters, digits, underscore). However, given the current
12369 definition of XID_Start and XID_Continue, it is sufficient
12370 to check just for these, except that _ must be allowed
12371 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012372 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012373 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012374 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012375
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012376 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012377 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012378 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012379 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012380 }
12381 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012382 return i;
12383}
12384
12385int
12386PyUnicode_IsIdentifier(PyObject *self)
12387{
12388 if (PyUnicode_IS_READY(self)) {
12389 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12390 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12391 /* an empty string is not a valid identifier */
12392 return len && i == len;
12393 }
12394 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012395_Py_COMP_DIAG_PUSH
12396_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012397 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012398 if (len == 0) {
12399 /* an empty string is not a valid identifier */
12400 return 0;
12401 }
12402
12403 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012404 Py_UCS4 ch = wstr[i++];
12405#if SIZEOF_WCHAR_T == 2
12406 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12407 && i < len
12408 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12409 {
12410 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12411 i++;
12412 }
12413#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012414 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12415 return 0;
12416 }
12417
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012418 while (i < len) {
12419 ch = wstr[i++];
12420#if SIZEOF_WCHAR_T == 2
12421 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12422 && i < len
12423 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12424 {
12425 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12426 i++;
12427 }
12428#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012429 if (!_PyUnicode_IsXidContinue(ch)) {
12430 return 0;
12431 }
12432 }
12433 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012434_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012435 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012436}
12437
INADA Naoki3ae20562017-01-16 20:41:20 +090012438/*[clinic input]
12439str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012440
INADA Naoki3ae20562017-01-16 20:41:20 +090012441Return True if the string is a valid Python identifier, False otherwise.
12442
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012443Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012444such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012445[clinic start generated code]*/
12446
12447static PyObject *
12448unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012449/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012450{
12451 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12452}
12453
INADA Naoki3ae20562017-01-16 20:41:20 +090012454/*[clinic input]
12455str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012456
INADA Naoki3ae20562017-01-16 20:41:20 +090012457Return True if the string is printable, False otherwise.
12458
12459A string is printable if all of its characters are considered printable in
12460repr() or if it is empty.
12461[clinic start generated code]*/
12462
12463static PyObject *
12464unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012465/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 Py_ssize_t i, length;
12468 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012469 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470
12471 if (PyUnicode_READY(self) == -1)
12472 return NULL;
12473 length = PyUnicode_GET_LENGTH(self);
12474 kind = PyUnicode_KIND(self);
12475 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012476
12477 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 if (length == 1)
12479 return PyBool_FromLong(
12480 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 for (i = 0; i < length; i++) {
12483 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012484 Py_RETURN_FALSE;
12485 }
12486 }
12487 Py_RETURN_TRUE;
12488}
12489
INADA Naoki3ae20562017-01-16 20:41:20 +090012490/*[clinic input]
12491str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492
INADA Naoki3ae20562017-01-16 20:41:20 +090012493 iterable: object
12494 /
12495
12496Concatenate any number of strings.
12497
Martin Panter91a88662017-01-24 00:30:06 +000012498The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012499The result is returned as a new string.
12500
12501Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12502[clinic start generated code]*/
12503
12504static PyObject *
12505unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012506/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507{
INADA Naoki3ae20562017-01-16 20:41:20 +090012508 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509}
12510
Martin v. Löwis18e16552006-02-15 17:27:45 +000012511static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012512unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 if (PyUnicode_READY(self) == -1)
12515 return -1;
12516 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517}
12518
INADA Naoki3ae20562017-01-16 20:41:20 +090012519/*[clinic input]
12520str.ljust as unicode_ljust
12521
12522 width: Py_ssize_t
12523 fillchar: Py_UCS4 = ' '
12524 /
12525
12526Return a left-justified string of length width.
12527
12528Padding is done using the specified fill character (default is a space).
12529[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
12531static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012532unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12533/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012535 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012536 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
Victor Stinnerc4b49542011-12-11 22:44:26 +010012538 if (PyUnicode_GET_LENGTH(self) >= width)
12539 return unicode_result_unchanged(self);
12540
12541 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542}
12543
INADA Naoki3ae20562017-01-16 20:41:20 +090012544/*[clinic input]
12545str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546
INADA Naoki3ae20562017-01-16 20:41:20 +090012547Return a copy of the string converted to lowercase.
12548[clinic start generated code]*/
12549
12550static PyObject *
12551unicode_lower_impl(PyObject *self)
12552/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012554 if (PyUnicode_READY(self) == -1)
12555 return NULL;
12556 if (PyUnicode_IS_ASCII(self))
12557 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012558 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559}
12560
/* Selector values for the strip family: which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the selector values above. */
static const char *stripfuncnames[] = {
    "lstrip",   /* LEFTSTRIP */
    "rstrip",   /* RIGHTSTRIP */
    "strip"     /* BOTHSTRIP */
};

/* Look up the method name that corresponds to selector `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012569
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012570/* externally visible for str.strip(unicode) */
12571PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012572_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012573{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012574 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 int kind;
12576 Py_ssize_t i, j, len;
12577 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012578 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12581 return NULL;
12582
12583 kind = PyUnicode_KIND(self);
12584 data = PyUnicode_DATA(self);
12585 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012586 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12588 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012589 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012590
Benjamin Peterson14339b62009-01-31 16:36:08 +000012591 i = 0;
12592 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012593 while (i < len) {
12594 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12595 if (!BLOOM(sepmask, ch))
12596 break;
12597 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12598 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012599 i++;
12600 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012601 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012602
Benjamin Peterson14339b62009-01-31 16:36:08 +000012603 j = len;
12604 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012605 j--;
12606 while (j >= i) {
12607 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12608 if (!BLOOM(sepmask, ch))
12609 break;
12610 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12611 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012613 }
12614
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012616 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012617
Victor Stinner7931d9a2011-11-04 00:22:48 +010012618 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619}
12620
12621PyObject*
12622PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12623{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012624 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012626 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627
Victor Stinnerde636f32011-10-01 03:55:54 +020012628 if (PyUnicode_READY(self) == -1)
12629 return NULL;
12630
Victor Stinner684d5fd2012-05-03 02:32:34 +020012631 length = PyUnicode_GET_LENGTH(self);
12632 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012633
Victor Stinner684d5fd2012-05-03 02:32:34 +020012634 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012635 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636
Victor Stinnerde636f32011-10-01 03:55:54 +020012637 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012638 PyErr_SetString(PyExc_IndexError, "string index out of range");
12639 return NULL;
12640 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012641 if (start >= length || end < start)
12642 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012643
Victor Stinner684d5fd2012-05-03 02:32:34 +020012644 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012645 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012646 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012647 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012648 }
12649 else {
12650 kind = PyUnicode_KIND(self);
12651 data = PyUnicode_1BYTE_DATA(self);
12652 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012653 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012654 length);
12655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657
12658static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012659do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 Py_ssize_t len, i, j;
12662
12663 if (PyUnicode_READY(self) == -1)
12664 return NULL;
12665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012667
Victor Stinnercc7af722013-04-09 22:39:24 +020012668 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012669 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012670
12671 i = 0;
12672 if (striptype != RIGHTSTRIP) {
12673 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012674 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012675 if (!_Py_ascii_whitespace[ch])
12676 break;
12677 i++;
12678 }
12679 }
12680
12681 j = len;
12682 if (striptype != LEFTSTRIP) {
12683 j--;
12684 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012685 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012686 if (!_Py_ascii_whitespace[ch])
12687 break;
12688 j--;
12689 }
12690 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012691 }
12692 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012693 else {
12694 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012695 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012696
Victor Stinnercc7af722013-04-09 22:39:24 +020012697 i = 0;
12698 if (striptype != RIGHTSTRIP) {
12699 while (i < len) {
12700 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12701 if (!Py_UNICODE_ISSPACE(ch))
12702 break;
12703 i++;
12704 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012705 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012706
12707 j = len;
12708 if (striptype != LEFTSTRIP) {
12709 j--;
12710 while (j >= i) {
12711 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12712 if (!Py_UNICODE_ISSPACE(ch))
12713 break;
12714 j--;
12715 }
12716 j++;
12717 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012718 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012719
Victor Stinner7931d9a2011-11-04 00:22:48 +010012720 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721}
12722
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012723
12724static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012725do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012726{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012727 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 if (PyUnicode_Check(sep))
12729 return _PyUnicode_XStrip(self, striptype, sep);
12730 else {
12731 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 "%s arg must be None or str",
12733 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012734 return NULL;
12735 }
12736 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012737
Benjamin Peterson14339b62009-01-31 16:36:08 +000012738 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012739}
12740
12741
INADA Naoki3ae20562017-01-16 20:41:20 +090012742/*[clinic input]
12743str.strip as unicode_strip
12744
12745 chars: object = None
12746 /
12747
Zachary Ware09895c22019-10-09 16:09:00 -050012748Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012749
12750If chars is given and not None, remove characters in chars instead.
12751[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012752
12753static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012754unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012755/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012756{
INADA Naoki3ae20562017-01-16 20:41:20 +090012757 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012758}
12759
12760
INADA Naoki3ae20562017-01-16 20:41:20 +090012761/*[clinic input]
12762str.lstrip as unicode_lstrip
12763
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012764 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012765 /
12766
12767Return a copy of the string with leading whitespace removed.
12768
12769If chars is given and not None, remove characters in chars instead.
12770[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012771
12772static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012773unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012774/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012775{
INADA Naoki3ae20562017-01-16 20:41:20 +090012776 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012777}
12778
12779
INADA Naoki3ae20562017-01-16 20:41:20 +090012780/*[clinic input]
12781str.rstrip as unicode_rstrip
12782
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012783 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012784 /
12785
12786Return a copy of the string with trailing whitespace removed.
12787
12788If chars is given and not None, remove characters in chars instead.
12789[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012790
12791static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012792unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012793/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012794{
INADA Naoki3ae20562017-01-16 20:41:20 +090012795 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012796}
12797
12798
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012800unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012802 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804
Serhiy Storchaka05997252013-01-26 12:14:02 +020012805 if (len < 1)
12806 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807
Victor Stinnerc4b49542011-12-11 22:44:26 +010012808 /* no repeat, return original string */
12809 if (len == 1)
12810 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012811
Benjamin Petersonbac79492012-01-14 13:34:47 -050012812 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813 return NULL;
12814
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012815 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012816 PyErr_SetString(PyExc_OverflowError,
12817 "repeated string is too long");
12818 return NULL;
12819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012821
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012822 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823 if (!u)
12824 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012825 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012828 int kind = PyUnicode_KIND(str);
12829 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012830 if (kind == PyUnicode_1BYTE_KIND) {
12831 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012832 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012833 }
12834 else if (kind == PyUnicode_2BYTE_KIND) {
12835 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012836 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012837 ucs2[n] = fill_char;
12838 } else {
12839 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12840 assert(kind == PyUnicode_4BYTE_KIND);
12841 for (n = 0; n < len; ++n)
12842 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 }
12845 else {
12846 /* number of characters copied this far */
12847 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012848 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012850 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012852 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012854 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012855 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857 }
12858
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012859 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012860 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861}
12862
Alexander Belopolsky40018472011-02-26 01:02:56 +000012863PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012864PyUnicode_Replace(PyObject *str,
12865 PyObject *substr,
12866 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012867 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012869 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12870 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012872 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873}
12874
INADA Naoki3ae20562017-01-16 20:41:20 +090012875/*[clinic input]
12876str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877
INADA Naoki3ae20562017-01-16 20:41:20 +090012878 old: unicode
12879 new: unicode
12880 count: Py_ssize_t = -1
12881 Maximum number of occurrences to replace.
12882 -1 (the default value) means replace all occurrences.
12883 /
12884
12885Return a copy with all occurrences of substring old replaced by new.
12886
12887If the optional argument count is given, only the first count occurrences are
12888replaced.
12889[clinic start generated code]*/
12890
12891static PyObject *
12892unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12893 Py_ssize_t count)
12894/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012896 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012898 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899}
12900
sweeneydea81849b2020-04-22 17:05:48 -040012901/*[clinic input]
12902str.removeprefix as unicode_removeprefix
12903
12904 prefix: unicode
12905 /
12906
12907Return a str with the given prefix string removed if present.
12908
12909If the string starts with the prefix string, return string[len(prefix):].
12910Otherwise, return a copy of the original string.
12911[clinic start generated code]*/
12912
12913static PyObject *
12914unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12915/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12916{
12917 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12918 if (match == -1) {
12919 return NULL;
12920 }
12921 if (match) {
12922 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12923 PyUnicode_GET_LENGTH(self));
12924 }
12925 return unicode_result_unchanged(self);
12926}
12927
12928/*[clinic input]
12929str.removesuffix as unicode_removesuffix
12930
12931 suffix: unicode
12932 /
12933
12934Return a str with the given suffix string removed if present.
12935
12936If the string ends with the suffix string and that suffix is not empty,
12937return string[:-len(suffix)]. Otherwise, return a copy of the original
12938string.
12939[clinic start generated code]*/
12940
12941static PyObject *
12942unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12943/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12944{
12945 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12946 if (match == -1) {
12947 return NULL;
12948 }
12949 if (match) {
12950 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12951 - PyUnicode_GET_LENGTH(suffix));
12952 }
12953 return unicode_result_unchanged(self);
12954}
12955
Alexander Belopolsky40018472011-02-26 01:02:56 +000012956static PyObject *
12957unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012959 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 Py_ssize_t isize;
12961 Py_ssize_t osize, squote, dquote, i, o;
12962 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012963 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012964 const void *idata;
12965 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012968 return NULL;
12969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 isize = PyUnicode_GET_LENGTH(unicode);
12971 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 /* Compute length of output, quote characters, and
12974 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012975 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 max = 127;
12977 squote = dquote = 0;
12978 ikind = PyUnicode_KIND(unicode);
12979 for (i = 0; i < isize; i++) {
12980 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012981 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012983 case '\'': squote++; break;
12984 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012986 incr = 2;
12987 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 default:
12989 /* Fast-path ASCII */
12990 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012991 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012993 ;
12994 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012997 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012999 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013001 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013003 if (osize > PY_SSIZE_T_MAX - incr) {
13004 PyErr_SetString(PyExc_OverflowError,
13005 "string is too long to generate repr");
13006 return NULL;
13007 }
13008 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 }
13010
13011 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013012 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013014 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 if (dquote)
13016 /* Both squote and dquote present. Use squote,
13017 and escape them */
13018 osize += squote;
13019 else
13020 quote = '"';
13021 }
Victor Stinner55c08782013-04-14 18:45:39 +020013022 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023
13024 repr = PyUnicode_New(osize, max);
13025 if (repr == NULL)
13026 return NULL;
13027 okind = PyUnicode_KIND(repr);
13028 odata = PyUnicode_DATA(repr);
13029
13030 PyUnicode_WRITE(okind, odata, 0, quote);
13031 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013032 if (unchanged) {
13033 _PyUnicode_FastCopyCharacters(repr, 1,
13034 unicode, 0,
13035 isize);
13036 }
13037 else {
13038 for (i = 0, o = 1; i < isize; i++) {
13039 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013040
Victor Stinner55c08782013-04-14 18:45:39 +020013041 /* Escape quotes and backslashes */
13042 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013043 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013045 continue;
13046 }
13047
13048 /* Map special whitespace to '\t', \n', '\r' */
13049 if (ch == '\t') {
13050 PyUnicode_WRITE(okind, odata, o++, '\\');
13051 PyUnicode_WRITE(okind, odata, o++, 't');
13052 }
13053 else if (ch == '\n') {
13054 PyUnicode_WRITE(okind, odata, o++, '\\');
13055 PyUnicode_WRITE(okind, odata, o++, 'n');
13056 }
13057 else if (ch == '\r') {
13058 PyUnicode_WRITE(okind, odata, o++, '\\');
13059 PyUnicode_WRITE(okind, odata, o++, 'r');
13060 }
13061
13062 /* Map non-printable US ASCII to '\xhh' */
13063 else if (ch < ' ' || ch == 0x7F) {
13064 PyUnicode_WRITE(okind, odata, o++, '\\');
13065 PyUnicode_WRITE(okind, odata, o++, 'x');
13066 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13067 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13068 }
13069
13070 /* Copy ASCII characters as-is */
13071 else if (ch < 0x7F) {
13072 PyUnicode_WRITE(okind, odata, o++, ch);
13073 }
13074
13075 /* Non-ASCII characters */
13076 else {
13077 /* Map Unicode whitespace and control characters
13078 (categories Z* and C* except ASCII space)
13079 */
13080 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13081 PyUnicode_WRITE(okind, odata, o++, '\\');
13082 /* Map 8-bit characters to '\xhh' */
13083 if (ch <= 0xff) {
13084 PyUnicode_WRITE(okind, odata, o++, 'x');
13085 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13086 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13087 }
13088 /* Map 16-bit characters to '\uxxxx' */
13089 else if (ch <= 0xffff) {
13090 PyUnicode_WRITE(okind, odata, o++, 'u');
13091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13093 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13094 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13095 }
13096 /* Map 21-bit characters to '\U00xxxxxx' */
13097 else {
13098 PyUnicode_WRITE(okind, odata, o++, 'U');
13099 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13100 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13101 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13104 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13105 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13106 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13107 }
13108 }
13109 /* Copy characters as-is */
13110 else {
13111 PyUnicode_WRITE(okind, odata, o++, ch);
13112 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013113 }
13114 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013116 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013117 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013118 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119}
13120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013121PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123\n\
13124Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013125such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126arguments start and end are interpreted as in slice notation.\n\
13127\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013128Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129
13130static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013133 /* initialize variables to prevent gcc warning */
13134 PyObject *substring = NULL;
13135 Py_ssize_t start = 0;
13136 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013137 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013139 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013142 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013144
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013145 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 if (result == -2)
13148 return NULL;
13149
Christian Heimes217cfd12007-12-02 14:31:20 +000013150 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151}
13152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013153PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013156Return the highest index in S where substring sub is found,\n\
13157such that sub is contained within S[start:end]. Optional\n\
13158arguments start and end are interpreted as in slice notation.\n\
13159\n\
13160Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161
13162static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013165 /* initialize variables to prevent gcc warning */
13166 PyObject *substring = NULL;
13167 Py_ssize_t start = 0;
13168 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013169 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013171 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013172 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013174 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013176
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013177 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 if (result == -2)
13180 return NULL;
13181
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182 if (result < 0) {
13183 PyErr_SetString(PyExc_ValueError, "substring not found");
13184 return NULL;
13185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013186
Christian Heimes217cfd12007-12-02 14:31:20 +000013187 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188}
13189
INADA Naoki3ae20562017-01-16 20:41:20 +090013190/*[clinic input]
13191str.rjust as unicode_rjust
13192
13193 width: Py_ssize_t
13194 fillchar: Py_UCS4 = ' '
13195 /
13196
13197Return a right-justified string of length width.
13198
13199Padding is done using the specified fill character (default is a space).
13200[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201
13202static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013203unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13204/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013205{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013206 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207 return NULL;
13208
Victor Stinnerc4b49542011-12-11 22:44:26 +010013209 if (PyUnicode_GET_LENGTH(self) >= width)
13210 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211
Victor Stinnerc4b49542011-12-11 22:44:26 +010013212 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213}
13214
Alexander Belopolsky40018472011-02-26 01:02:56 +000013215PyObject *
13216PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013218 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013219 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013220
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013221 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222}
13223
INADA Naoki3ae20562017-01-16 20:41:20 +090013224/*[clinic input]
13225str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226
INADA Naoki3ae20562017-01-16 20:41:20 +090013227 sep: object = None
        The delimiter according to which to split the string.
13229 None (the default value) means split according to any whitespace,
13230 and discard empty strings from the result.
13231 maxsplit: Py_ssize_t = -1
13232 Maximum number of splits to do.
13233 -1 (the default value) means no limit.
13234
13235Return a list of the words in the string, using sep as the delimiter string.
13236[clinic start generated code]*/
13237
13238static PyObject *
13239unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13240/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241{
INADA Naoki3ae20562017-01-16 20:41:20 +090013242 if (sep == Py_None)
13243 return split(self, NULL, maxsplit);
13244 if (PyUnicode_Check(sep))
13245 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013246
Victor Stinner998b8062018-09-12 00:23:25 +020013247 PyErr_Format(PyExc_TypeError,
13248 "must be str or None, not %.100s",
13249 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251}
13252
Thomas Wouters477c8d52006-05-27 19:21:47 +000013253PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013254PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013255{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013256 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013257 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013258 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013259 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013260
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013261 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013262 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013263
Victor Stinner14f8f022011-10-05 20:58:25 +020013264 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013265 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013266 len1 = PyUnicode_GET_LENGTH(str_obj);
13267 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013268 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013269 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013270 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013271 }
13272 buf1 = PyUnicode_DATA(str_obj);
13273 buf2 = PyUnicode_DATA(sep_obj);
13274 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013275 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013276 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013277 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013280 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013281 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013282 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13283 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13284 else
13285 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 break;
13287 case PyUnicode_2BYTE_KIND:
13288 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13289 break;
13290 case PyUnicode_4BYTE_KIND:
13291 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13292 break;
13293 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013294 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013295 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013296
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013297 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013298 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013299 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013300
13301 return out;
13302}
13303
13304
13305PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013306PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013307{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013308 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013309 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013310 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013312
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013313 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013315
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013316 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013317 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 len1 = PyUnicode_GET_LENGTH(str_obj);
13319 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013320 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013321 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013322 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013323 }
13324 buf1 = PyUnicode_DATA(str_obj);
13325 buf2 = PyUnicode_DATA(sep_obj);
13326 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013327 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013328 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013329 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013332 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013333 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013334 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13335 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13336 else
13337 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 break;
13339 case PyUnicode_2BYTE_KIND:
13340 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13341 break;
13342 case PyUnicode_4BYTE_KIND:
13343 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13344 break;
13345 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013346 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013347 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013348
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013349 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013350 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013351 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013352
13353 return out;
13354}
13355
INADA Naoki3ae20562017-01-16 20:41:20 +090013356/*[clinic input]
13357str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013358
INADA Naoki3ae20562017-01-16 20:41:20 +090013359 sep: object
13360 /
13361
13362Partition the string into three parts using the given separator.
13363
13364This will search for the separator in the string. If the separator is found,
13365returns a 3-tuple containing the part before the separator, the separator
13366itself, and the part after it.
13367
13368If the separator is not found, returns a 3-tuple containing the original string
13369and two empty strings.
13370[clinic start generated code]*/
13371
13372static PyObject *
13373unicode_partition(PyObject *self, PyObject *sep)
13374/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013375{
INADA Naoki3ae20562017-01-16 20:41:20 +090013376 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013377}
13378
INADA Naoki3ae20562017-01-16 20:41:20 +090013379/*[clinic input]
13380str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013381
INADA Naoki3ae20562017-01-16 20:41:20 +090013382Partition the string into three parts using the given separator.
13383
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013384This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013385the separator is found, returns a 3-tuple containing the part before the
13386separator, the separator itself, and the part after it.
13387
13388If the separator is not found, returns a 3-tuple containing two empty strings
13389and the original string.
13390[clinic start generated code]*/
13391
13392static PyObject *
13393unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013394/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013395{
INADA Naoki3ae20562017-01-16 20:41:20 +090013396 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013397}
13398
Alexander Belopolsky40018472011-02-26 01:02:56 +000013399PyObject *
13400PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013401{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013402 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013403 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013404
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013405 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013406}
13407
INADA Naoki3ae20562017-01-16 20:41:20 +090013408/*[clinic input]
13409str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013410
INADA Naoki3ae20562017-01-16 20:41:20 +090013411Return a list of the words in the string, using sep as the delimiter string.
13412
13413Splits are done starting at the end of the string and working to the front.
13414[clinic start generated code]*/
13415
13416static PyObject *
13417unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13418/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013419{
INADA Naoki3ae20562017-01-16 20:41:20 +090013420 if (sep == Py_None)
13421 return rsplit(self, NULL, maxsplit);
13422 if (PyUnicode_Check(sep))
13423 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013424
Victor Stinner998b8062018-09-12 00:23:25 +020013425 PyErr_Format(PyExc_TypeError,
13426 "must be str or None, not %.100s",
13427 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013428 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013429}
13430
INADA Naoki3ae20562017-01-16 20:41:20 +090013431/*[clinic input]
13432str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013434 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013435
13436Return a list of the lines in the string, breaking at line boundaries.
13437
13438Line breaks are not included in the resulting list unless keepends is given and
13439true.
13440[clinic start generated code]*/
13441
13442static PyObject *
13443unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013444/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013446 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447}
13448
13449static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013450PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013451{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013452 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013453}
13454
INADA Naoki3ae20562017-01-16 20:41:20 +090013455/*[clinic input]
13456str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457
INADA Naoki3ae20562017-01-16 20:41:20 +090013458Convert uppercase characters to lowercase and lowercase characters to uppercase.
13459[clinic start generated code]*/
13460
13461static PyObject *
13462unicode_swapcase_impl(PyObject *self)
13463/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013464{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013465 if (PyUnicode_READY(self) == -1)
13466 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013467 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013468}
13469
Larry Hastings61272b72014-01-07 12:41:53 -080013470/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013471
Larry Hastings31826802013-10-19 00:09:25 -070013472@staticmethod
13473str.maketrans as unicode_maketrans
13474
13475 x: object
13476
13477 y: unicode=NULL
13478
13479 z: unicode=NULL
13480
13481 /
13482
13483Return a translation table usable for str.translate().
13484
13485If there is only one argument, it must be a dictionary mapping Unicode
13486ordinals (integers) or characters to Unicode ordinals, strings or None.
13487Character keys will be then converted to ordinals.
13488If there are two arguments, they must be strings of equal length, and
13489in the resulting dictionary, each character in x will be mapped to the
13490character at the same position in y. If there is a third argument, it
13491must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013492[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013493
Larry Hastings31826802013-10-19 00:09:25 -070013494static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013495unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013496/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013497{
Georg Brandlceee0772007-11-27 23:48:05 +000013498 PyObject *new = NULL, *key, *value;
13499 Py_ssize_t i = 0;
13500 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013501
Georg Brandlceee0772007-11-27 23:48:05 +000013502 new = PyDict_New();
13503 if (!new)
13504 return NULL;
13505 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013506 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013507 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013508
Georg Brandlceee0772007-11-27 23:48:05 +000013509 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013510 if (!PyUnicode_Check(x)) {
13511 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13512 "be a string if there is a second argument");
13513 goto err;
13514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013515 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013516 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13517 "arguments must have equal length");
13518 goto err;
13519 }
13520 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013521 x_kind = PyUnicode_KIND(x);
13522 y_kind = PyUnicode_KIND(y);
13523 x_data = PyUnicode_DATA(x);
13524 y_data = PyUnicode_DATA(y);
13525 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13526 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013527 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013528 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013529 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013530 if (!value) {
13531 Py_DECREF(key);
13532 goto err;
13533 }
Georg Brandlceee0772007-11-27 23:48:05 +000013534 res = PyDict_SetItem(new, key, value);
13535 Py_DECREF(key);
13536 Py_DECREF(value);
13537 if (res < 0)
13538 goto err;
13539 }
13540 /* create entries for deleting chars in z */
13541 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013542 z_kind = PyUnicode_KIND(z);
13543 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013544 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013545 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013546 if (!key)
13547 goto err;
13548 res = PyDict_SetItem(new, key, Py_None);
13549 Py_DECREF(key);
13550 if (res < 0)
13551 goto err;
13552 }
13553 }
13554 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013556 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013557
Georg Brandlceee0772007-11-27 23:48:05 +000013558 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013559 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013560 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13561 "to maketrans it must be a dict");
13562 goto err;
13563 }
13564 /* copy entries into the new dict, converting string keys to int keys */
13565 while (PyDict_Next(x, &i, &key, &value)) {
13566 if (PyUnicode_Check(key)) {
13567 /* convert string keys to integer keys */
13568 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013569 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013570 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13571 "table must be of length 1");
13572 goto err;
13573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013574 kind = PyUnicode_KIND(key);
13575 data = PyUnicode_DATA(key);
13576 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013577 if (!newkey)
13578 goto err;
13579 res = PyDict_SetItem(new, newkey, value);
13580 Py_DECREF(newkey);
13581 if (res < 0)
13582 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013583 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013584 /* just keep integer keys */
13585 if (PyDict_SetItem(new, key, value) < 0)
13586 goto err;
13587 } else {
13588 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13589 "be strings or integers");
13590 goto err;
13591 }
13592 }
13593 }
13594 return new;
13595 err:
13596 Py_DECREF(new);
13597 return NULL;
13598}
13599
INADA Naoki3ae20562017-01-16 20:41:20 +090013600/*[clinic input]
13601str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013602
INADA Naoki3ae20562017-01-16 20:41:20 +090013603 table: object
13604 Translation table, which must be a mapping of Unicode ordinals to
13605 Unicode ordinals, strings, or None.
13606 /
13607
13608Replace each character in the string using the given translation table.
13609
13610The table must implement lookup/indexing via __getitem__, for instance a
13611dictionary or list. If this operation raises LookupError, the character is
13612left untouched. Characters mapped to None are deleted.
13613[clinic start generated code]*/
13614
13615static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013616unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013617/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013618{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013619 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013620}
13621
INADA Naoki3ae20562017-01-16 20:41:20 +090013622/*[clinic input]
13623str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013624
INADA Naoki3ae20562017-01-16 20:41:20 +090013625Return a copy of the string converted to uppercase.
13626[clinic start generated code]*/
13627
13628static PyObject *
13629unicode_upper_impl(PyObject *self)
13630/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013631{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013632 if (PyUnicode_READY(self) == -1)
13633 return NULL;
13634 if (PyUnicode_IS_ASCII(self))
13635 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013636 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013637}
13638
INADA Naoki3ae20562017-01-16 20:41:20 +090013639/*[clinic input]
13640str.zfill as unicode_zfill
13641
13642 width: Py_ssize_t
13643 /
13644
13645Pad a numeric string with zeros on the left, to fill a field of the given width.
13646
13647The string is never truncated.
13648[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013649
13650static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013651unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013652/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013653{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013654 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013655 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013656 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013657 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013658 Py_UCS4 chr;
13659
Benjamin Petersonbac79492012-01-14 13:34:47 -050013660 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013662
Victor Stinnerc4b49542011-12-11 22:44:26 +010013663 if (PyUnicode_GET_LENGTH(self) >= width)
13664 return unicode_result_unchanged(self);
13665
13666 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013667
13668 u = pad(self, fill, 0, '0');
13669
Walter Dörwald068325e2002-04-15 13:36:47 +000013670 if (u == NULL)
13671 return NULL;
13672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013673 kind = PyUnicode_KIND(u);
13674 data = PyUnicode_DATA(u);
13675 chr = PyUnicode_READ(kind, data, fill);
13676
13677 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013678 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013679 PyUnicode_WRITE(kind, data, 0, chr);
13680 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013681 }
13682
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013683 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013684 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013685}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013686
#if 0
/* Compiled out.  Wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() for self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013695PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013696 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013697\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013698Return True if S starts with the specified prefix, False otherwise.\n\
13699With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013700With optional end, stop comparing S at that position.\n\
13701prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013702
13703static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013704unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013706{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013707 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013708 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013709 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013710 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013711 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013712
Jesus Ceaac451502011-04-20 17:09:23 +020013713 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013715 if (PyTuple_Check(subobj)) {
13716 Py_ssize_t i;
13717 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013718 substring = PyTuple_GET_ITEM(subobj, i);
13719 if (!PyUnicode_Check(substring)) {
13720 PyErr_Format(PyExc_TypeError,
13721 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013722 "not %.100s",
13723 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013724 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013725 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013726 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013727 if (result == -1)
13728 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013729 if (result) {
13730 Py_RETURN_TRUE;
13731 }
13732 }
13733 /* nothing matched */
13734 Py_RETURN_FALSE;
13735 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013736 if (!PyUnicode_Check(subobj)) {
13737 PyErr_Format(PyExc_TypeError,
13738 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013739 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013741 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013742 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013743 if (result == -1)
13744 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013745 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013746}
13747
13748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013749PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013750 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013751\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013752Return True if S ends with the specified suffix, False otherwise.\n\
13753With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013754With optional end, stop comparing S at that position.\n\
13755suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013756
13757static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013758unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013759 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013760{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013761 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013762 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013763 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013764 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013765 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013766
Jesus Ceaac451502011-04-20 17:09:23 +020013767 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013768 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013769 if (PyTuple_Check(subobj)) {
13770 Py_ssize_t i;
13771 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013772 substring = PyTuple_GET_ITEM(subobj, i);
13773 if (!PyUnicode_Check(substring)) {
13774 PyErr_Format(PyExc_TypeError,
13775 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013776 "not %.100s",
13777 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013778 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013779 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013780 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013781 if (result == -1)
13782 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013783 if (result) {
13784 Py_RETURN_TRUE;
13785 }
13786 }
13787 Py_RETURN_FALSE;
13788 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013789 if (!PyUnicode_Check(subobj)) {
13790 PyErr_Format(PyExc_TypeError,
13791 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013792 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013793 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013794 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013795 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013796 if (result == -1)
13797 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013798 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013799}
13800
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013801static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013802_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013803{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013804 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13805 writer->data = PyUnicode_DATA(writer->buffer);
13806
13807 if (!writer->readonly) {
13808 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013809 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013810 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013811 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013812 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13813 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13814 writer->kind = PyUnicode_WCHAR_KIND;
13815 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13816
Victor Stinner8f674cc2013-04-17 23:02:17 +020013817 /* Copy-on-write mode: set buffer size to 0 so
13818 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13819 * next write. */
13820 writer->size = 0;
13821 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013822}
13823
Victor Stinnerd3f08822012-05-29 12:57:52 +020013824void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013825_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013826{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013827 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013828
13829 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013830 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013831
13832 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13833 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13834 writer->kind = PyUnicode_WCHAR_KIND;
13835 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013836}
13837
Inada Naoki770847a2019-06-24 12:30:24 +090013838// Initialize _PyUnicodeWriter with initial buffer
13839static inline void
13840_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13841{
13842 memset(writer, 0, sizeof(*writer));
13843 writer->buffer = buffer;
13844 _PyUnicodeWriter_Update(writer);
13845 writer->min_length = writer->size;
13846}
13847
Victor Stinnerd3f08822012-05-29 12:57:52 +020013848int
13849_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13850 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013851{
13852 Py_ssize_t newlen;
13853 PyObject *newbuffer;
13854
Victor Stinner2740e462016-09-06 16:58:36 -070013855 assert(maxchar <= MAX_UNICODE);
13856
Victor Stinnerca9381e2015-09-22 00:58:32 +020013857 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013858 assert((maxchar > writer->maxchar && length >= 0)
13859 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013860
Victor Stinner202fdca2012-05-07 12:47:02 +020013861 if (length > PY_SSIZE_T_MAX - writer->pos) {
13862 PyErr_NoMemory();
13863 return -1;
13864 }
13865 newlen = writer->pos + length;
13866
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013867 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013868
Victor Stinnerd3f08822012-05-29 12:57:52 +020013869 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013870 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013871 if (writer->overallocate
13872 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13873 /* overallocate to limit the number of realloc() */
13874 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013875 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013876 if (newlen < writer->min_length)
13877 newlen = writer->min_length;
13878
Victor Stinnerd3f08822012-05-29 12:57:52 +020013879 writer->buffer = PyUnicode_New(newlen, maxchar);
13880 if (writer->buffer == NULL)
13881 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013882 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013883 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013884 if (writer->overallocate
13885 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13886 /* overallocate to limit the number of realloc() */
13887 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013888 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013889 if (newlen < writer->min_length)
13890 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013891
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013892 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013893 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013894 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013895 newbuffer = PyUnicode_New(newlen, maxchar);
13896 if (newbuffer == NULL)
13897 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013898 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13899 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013900 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013901 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013902 }
13903 else {
13904 newbuffer = resize_compact(writer->buffer, newlen);
13905 if (newbuffer == NULL)
13906 return -1;
13907 }
13908 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013909 }
13910 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013911 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013912 newbuffer = PyUnicode_New(writer->size, maxchar);
13913 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013914 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013915 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13916 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013917 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013918 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013919 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013920 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013921
13922#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013923}
13924
Victor Stinnerca9381e2015-09-22 00:58:32 +020013925int
13926_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13927 enum PyUnicode_Kind kind)
13928{
13929 Py_UCS4 maxchar;
13930
13931 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13932 assert(writer->kind < kind);
13933
13934 switch (kind)
13935 {
13936 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13937 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13938 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13939 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013940 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013941 }
13942
13943 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13944}
13945
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013946static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013947_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013948{
Victor Stinner2740e462016-09-06 16:58:36 -070013949 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013950 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13951 return -1;
13952 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13953 writer->pos++;
13954 return 0;
13955}
13956
13957int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013958_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13959{
13960 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13961}
13962
13963int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013964_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13965{
13966 Py_UCS4 maxchar;
13967 Py_ssize_t len;
13968
13969 if (PyUnicode_READY(str) == -1)
13970 return -1;
13971 len = PyUnicode_GET_LENGTH(str);
13972 if (len == 0)
13973 return 0;
13974 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13975 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013976 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013977 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013978 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013979 Py_INCREF(str);
13980 writer->buffer = str;
13981 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013982 writer->pos += len;
13983 return 0;
13984 }
13985 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13986 return -1;
13987 }
13988 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13989 str, 0, len);
13990 writer->pos += len;
13991 return 0;
13992}
13993
Victor Stinnere215d962012-10-06 23:03:36 +020013994int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013995_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13996 Py_ssize_t start, Py_ssize_t end)
13997{
13998 Py_UCS4 maxchar;
13999 Py_ssize_t len;
14000
14001 if (PyUnicode_READY(str) == -1)
14002 return -1;
14003
14004 assert(0 <= start);
14005 assert(end <= PyUnicode_GET_LENGTH(str));
14006 assert(start <= end);
14007
14008 if (end == 0)
14009 return 0;
14010
14011 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14012 return _PyUnicodeWriter_WriteStr(writer, str);
14013
14014 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14015 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14016 else
14017 maxchar = writer->maxchar;
14018 len = end - start;
14019
14020 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14021 return -1;
14022
14023 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14024 str, start, len);
14025 writer->pos += len;
14026 return 0;
14027}
14028
14029int
Victor Stinner4a587072013-11-19 12:54:53 +010014030_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14031 const char *ascii, Py_ssize_t len)
14032{
14033 if (len == -1)
14034 len = strlen(ascii);
14035
Andy Lestere6be9b52020-02-11 20:28:35 -060014036 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014037
14038 if (writer->buffer == NULL && !writer->overallocate) {
14039 PyObject *str;
14040
14041 str = _PyUnicode_FromASCII(ascii, len);
14042 if (str == NULL)
14043 return -1;
14044
14045 writer->readonly = 1;
14046 writer->buffer = str;
14047 _PyUnicodeWriter_Update(writer);
14048 writer->pos += len;
14049 return 0;
14050 }
14051
14052 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14053 return -1;
14054
14055 switch (writer->kind)
14056 {
14057 case PyUnicode_1BYTE_KIND:
14058 {
14059 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14060 Py_UCS1 *data = writer->data;
14061
Christian Heimesf051e432016-09-13 20:22:02 +020014062 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014063 break;
14064 }
14065 case PyUnicode_2BYTE_KIND:
14066 {
14067 _PyUnicode_CONVERT_BYTES(
14068 Py_UCS1, Py_UCS2,
14069 ascii, ascii + len,
14070 (Py_UCS2 *)writer->data + writer->pos);
14071 break;
14072 }
14073 case PyUnicode_4BYTE_KIND:
14074 {
14075 _PyUnicode_CONVERT_BYTES(
14076 Py_UCS1, Py_UCS4,
14077 ascii, ascii + len,
14078 (Py_UCS4 *)writer->data + writer->pos);
14079 break;
14080 }
14081 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014082 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014083 }
14084
14085 writer->pos += len;
14086 return 0;
14087}
14088
14089int
14090_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14091 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014092{
14093 Py_UCS4 maxchar;
14094
Andy Lestere6be9b52020-02-11 20:28:35 -060014095 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014096 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14097 return -1;
14098 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14099 writer->pos += len;
14100 return 0;
14101}
14102
Victor Stinnerd3f08822012-05-29 12:57:52 +020014103PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014104_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014105{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014106 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014107
Victor Stinnerd3f08822012-05-29 12:57:52 +020014108 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014109 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014110 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014111 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014112
14113 str = writer->buffer;
14114 writer->buffer = NULL;
14115
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014116 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014117 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14118 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014119 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014120
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014121 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14122 PyObject *str2;
14123 str2 = resize_compact(str, writer->pos);
14124 if (str2 == NULL) {
14125 Py_DECREF(str);
14126 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014127 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014128 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014129 }
14130
Victor Stinner15a0bd32013-07-08 22:29:55 +020014131 assert(_PyUnicode_CheckConsistency(str, 1));
14132 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014133}
14134
Victor Stinnerd3f08822012-05-29 12:57:52 +020014135void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014136_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014137{
14138 Py_CLEAR(writer->buffer);
14139}
14140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014141#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014142
14143PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014144 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014145\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014146Return a formatted version of S, using substitutions from args and kwargs.\n\
14147The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014148
Eric Smith27bbca62010-11-04 17:06:58 +000014149PyDoc_STRVAR(format_map__doc__,
14150 "S.format_map(mapping) -> str\n\
14151\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014152Return a formatted version of S, using substitutions from mapping.\n\
14153The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014154
INADA Naoki3ae20562017-01-16 20:41:20 +090014155/*[clinic input]
14156str.__format__ as unicode___format__
14157
14158 format_spec: unicode
14159 /
14160
14161Return a formatted version of the string as described by format_spec.
14162[clinic start generated code]*/
14163
Eric Smith4a7d76d2008-05-30 18:10:19 +000014164static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014165unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014166/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014167{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014168 _PyUnicodeWriter writer;
14169 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014170
Victor Stinnerd3f08822012-05-29 12:57:52 +020014171 if (PyUnicode_READY(self) == -1)
14172 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014173 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014174 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14175 self, format_spec, 0,
14176 PyUnicode_GET_LENGTH(format_spec));
14177 if (ret == -1) {
14178 _PyUnicodeWriter_Dealloc(&writer);
14179 return NULL;
14180 }
14181 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014182}
14183
INADA Naoki3ae20562017-01-16 20:41:20 +090014184/*[clinic input]
14185str.__sizeof__ as unicode_sizeof
14186
14187Return the size of the string in memory, in bytes.
14188[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014189
14190static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014191unicode_sizeof_impl(PyObject *self)
14192/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014194 Py_ssize_t size;
14195
14196 /* If it's a compact object, account for base structure +
14197 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014198 if (PyUnicode_IS_COMPACT_ASCII(self))
14199 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14200 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014201 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014202 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014203 else {
14204 /* If it is a two-block object, account for base object, and
14205 for character block if present. */
14206 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014207 if (_PyUnicode_DATA_ANY(self))
14208 size += (PyUnicode_GET_LENGTH(self) + 1) *
14209 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014210 }
14211 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014212 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014213 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14214 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14215 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14216 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014217
14218 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014219}
14220
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014221static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014222unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014223{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014224 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014225 if (!copy)
14226 return NULL;
14227 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014228}
14229
Guido van Rossumd57fd912000-03-10 22:53:23 +000014230static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014231 UNICODE_ENCODE_METHODDEF
14232 UNICODE_REPLACE_METHODDEF
14233 UNICODE_SPLIT_METHODDEF
14234 UNICODE_RSPLIT_METHODDEF
14235 UNICODE_JOIN_METHODDEF
14236 UNICODE_CAPITALIZE_METHODDEF
14237 UNICODE_CASEFOLD_METHODDEF
14238 UNICODE_TITLE_METHODDEF
14239 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014240 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014241 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014242 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014243 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014244 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014245 UNICODE_LJUST_METHODDEF
14246 UNICODE_LOWER_METHODDEF
14247 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014248 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14249 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014250 UNICODE_RJUST_METHODDEF
14251 UNICODE_RSTRIP_METHODDEF
14252 UNICODE_RPARTITION_METHODDEF
14253 UNICODE_SPLITLINES_METHODDEF
14254 UNICODE_STRIP_METHODDEF
14255 UNICODE_SWAPCASE_METHODDEF
14256 UNICODE_TRANSLATE_METHODDEF
14257 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014258 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14259 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014260 UNICODE_REMOVEPREFIX_METHODDEF
14261 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014262 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014263 UNICODE_ISLOWER_METHODDEF
14264 UNICODE_ISUPPER_METHODDEF
14265 UNICODE_ISTITLE_METHODDEF
14266 UNICODE_ISSPACE_METHODDEF
14267 UNICODE_ISDECIMAL_METHODDEF
14268 UNICODE_ISDIGIT_METHODDEF
14269 UNICODE_ISNUMERIC_METHODDEF
14270 UNICODE_ISALPHA_METHODDEF
14271 UNICODE_ISALNUM_METHODDEF
14272 UNICODE_ISIDENTIFIER_METHODDEF
14273 UNICODE_ISPRINTABLE_METHODDEF
14274 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014275 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014276 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014277 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014278 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014279 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014280#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014281 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014282 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014283#endif
14284
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014285 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014286 {NULL, NULL}
14287};
14288
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014289static PyObject *
14290unicode_mod(PyObject *v, PyObject *w)
14291{
Brian Curtindfc80e32011-08-10 20:28:54 -050014292 if (!PyUnicode_Check(v))
14293 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014294 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014295}
14296
14297static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014298 0, /*nb_add*/
14299 0, /*nb_subtract*/
14300 0, /*nb_multiply*/
14301 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014302};
14303
Guido van Rossumd57fd912000-03-10 22:53:23 +000014304static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014305 (lenfunc) unicode_length, /* sq_length */
14306 PyUnicode_Concat, /* sq_concat */
14307 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14308 (ssizeargfunc) unicode_getitem, /* sq_item */
14309 0, /* sq_slice */
14310 0, /* sq_ass_item */
14311 0, /* sq_ass_slice */
14312 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014313};
14314
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014315static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014316unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014317{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014318 if (PyUnicode_READY(self) == -1)
14319 return NULL;
14320
Victor Stinnera15e2602020-04-08 02:01:56 +020014321 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014322 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014323 if (i == -1 && PyErr_Occurred())
14324 return NULL;
14325 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014326 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014327 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014328 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014329 Py_ssize_t start, stop, step, slicelength, i;
14330 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014331 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014332 const void *src_data;
14333 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014334 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014335 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014336
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014337 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014338 return NULL;
14339 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014340 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14341 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014342
14343 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014344 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014345 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014346 slicelength == PyUnicode_GET_LENGTH(self)) {
14347 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014348 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014349 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014350 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014351 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014352 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014353 src_kind = PyUnicode_KIND(self);
14354 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014355 if (!PyUnicode_IS_ASCII(self)) {
14356 kind_limit = kind_maxchar_limit(src_kind);
14357 max_char = 0;
14358 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14359 ch = PyUnicode_READ(src_kind, src_data, cur);
14360 if (ch > max_char) {
14361 max_char = ch;
14362 if (max_char >= kind_limit)
14363 break;
14364 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014365 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014366 }
Victor Stinner55c99112011-10-13 01:17:06 +020014367 else
14368 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014369 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014370 if (result == NULL)
14371 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014372 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014373 dest_data = PyUnicode_DATA(result);
14374
14375 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014376 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14377 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014378 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014379 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014380 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014381 } else {
14382 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14383 return NULL;
14384 }
14385}
14386
14387static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014388 (lenfunc)unicode_length, /* mp_length */
14389 (binaryfunc)unicode_subscript, /* mp_subscript */
14390 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014391};
14392
Guido van Rossumd57fd912000-03-10 22:53:23 +000014393
Guido van Rossumd57fd912000-03-10 22:53:23 +000014394/* Helpers for PyUnicode_Format() */
14395
Victor Stinnera47082312012-10-04 02:19:54 +020014396struct unicode_formatter_t {
14397 PyObject *args;
14398 int args_owned;
14399 Py_ssize_t arglen, argidx;
14400 PyObject *dict;
14401
14402 enum PyUnicode_Kind fmtkind;
14403 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014404 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014405 PyObject *fmtstr;
14406
14407 _PyUnicodeWriter writer;
14408};
14409
14410struct unicode_format_arg_t {
14411 Py_UCS4 ch;
14412 int flags;
14413 Py_ssize_t width;
14414 int prec;
14415 int sign;
14416};
14417
Guido van Rossumd57fd912000-03-10 22:53:23 +000014418static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014419unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014420{
Victor Stinnera47082312012-10-04 02:19:54 +020014421 Py_ssize_t argidx = ctx->argidx;
14422
14423 if (argidx < ctx->arglen) {
14424 ctx->argidx++;
14425 if (ctx->arglen < 0)
14426 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014427 else
Victor Stinnera47082312012-10-04 02:19:54 +020014428 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014429 }
14430 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014431 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014432 return NULL;
14433}
14434
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014435/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014436
Victor Stinnera47082312012-10-04 02:19:54 +020014437/* Format a float into the writer if the writer is not NULL, or into *p_output
14438 otherwise.
14439
14440 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014441static int
Victor Stinnera47082312012-10-04 02:19:54 +020014442formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14443 PyObject **p_output,
14444 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014445{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014446 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014447 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014448 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014449 int prec;
14450 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014451
Guido van Rossumd57fd912000-03-10 22:53:23 +000014452 x = PyFloat_AsDouble(v);
14453 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014454 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014455
Victor Stinnera47082312012-10-04 02:19:54 +020014456 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014457 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014458 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014459
Victor Stinnera47082312012-10-04 02:19:54 +020014460 if (arg->flags & F_ALT)
14461 dtoa_flags = Py_DTSF_ALT;
14462 else
14463 dtoa_flags = 0;
14464 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014465 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014466 return -1;
14467 len = strlen(p);
14468 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014469 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014470 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014471 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014472 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014473 }
14474 else
14475 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014476 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014477 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014478}
14479
Victor Stinnerd0880d52012-04-27 23:40:13 +020014480/* formatlong() emulates the format codes d, u, o, x and X, and
14481 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14482 * Python's regular ints.
14483 * Return value: a new PyUnicodeObject*, or NULL if error.
14484 * The output string is of the form
14485 * "-"? ("0x" | "0X")? digit+
14486 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14487 * set in flags. The case of hex digits will be correct,
14488 * There will be at least prec digits, zero-filled on the left if
14489 * necessary to get that many.
14490 * val object to be converted
14491 * flags bitmask of format flags; only F_ALT is looked at
14492 * prec minimum number of digits; 0-fill on left if needed
14493 * type a character in [duoxX]; u acts the same as d
14494 *
14495 * CAUTION: o, x and X conversions on regular ints can never
14496 * produce a '-' sign, but can for Python's unbounded ints.
14497 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014498PyObject *
14499_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014500{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014501 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014502 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014503 Py_ssize_t i;
14504 int sign; /* 1 if '-', else 0 */
14505 int len; /* number of characters */
14506 Py_ssize_t llen;
14507 int numdigits; /* len == numnondigits + numdigits */
14508 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014509
Victor Stinnerd0880d52012-04-27 23:40:13 +020014510 /* Avoid exceeding SSIZE_T_MAX */
14511 if (prec > INT_MAX-3) {
14512 PyErr_SetString(PyExc_OverflowError,
14513 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014514 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014515 }
14516
14517 assert(PyLong_Check(val));
14518
14519 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014520 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014521 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014522 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014523 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014524 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014525 /* int and int subclasses should print numerically when a numeric */
14526 /* format code is used (see issue18780) */
14527 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014528 break;
14529 case 'o':
14530 numnondigits = 2;
14531 result = PyNumber_ToBase(val, 8);
14532 break;
14533 case 'x':
14534 case 'X':
14535 numnondigits = 2;
14536 result = PyNumber_ToBase(val, 16);
14537 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014538 }
14539 if (!result)
14540 return NULL;
14541
14542 assert(unicode_modifiable(result));
14543 assert(PyUnicode_IS_READY(result));
14544 assert(PyUnicode_IS_ASCII(result));
14545
14546 /* To modify the string in-place, there can only be one reference. */
14547 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014548 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014549 PyErr_BadInternalCall();
14550 return NULL;
14551 }
14552 buf = PyUnicode_DATA(result);
14553 llen = PyUnicode_GET_LENGTH(result);
14554 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014555 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014556 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014557 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014558 return NULL;
14559 }
14560 len = (int)llen;
14561 sign = buf[0] == '-';
14562 numnondigits += sign;
14563 numdigits = len - numnondigits;
14564 assert(numdigits > 0);
14565
14566 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014567 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014568 (type == 'o' || type == 'x' || type == 'X'))) {
14569 assert(buf[sign] == '0');
14570 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14571 buf[sign+1] == 'o');
14572 numnondigits -= 2;
14573 buf += 2;
14574 len -= 2;
14575 if (sign)
14576 buf[0] = '-';
14577 assert(len == numnondigits + numdigits);
14578 assert(numdigits > 0);
14579 }
14580
14581 /* Fill with leading zeroes to meet minimum width. */
14582 if (prec > numdigits) {
14583 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14584 numnondigits + prec);
14585 char *b1;
14586 if (!r1) {
14587 Py_DECREF(result);
14588 return NULL;
14589 }
14590 b1 = PyBytes_AS_STRING(r1);
14591 for (i = 0; i < numnondigits; ++i)
14592 *b1++ = *buf++;
14593 for (i = 0; i < prec - numdigits; i++)
14594 *b1++ = '0';
14595 for (i = 0; i < numdigits; i++)
14596 *b1++ = *buf++;
14597 *b1 = '\0';
14598 Py_DECREF(result);
14599 result = r1;
14600 buf = PyBytes_AS_STRING(result);
14601 len = numnondigits + prec;
14602 }
14603
14604 /* Fix up case for hex conversions. */
14605 if (type == 'X') {
14606 /* Need to convert all lower case letters to upper case.
14607 and need to convert 0x to 0X (and -0x to -0X). */
14608 for (i = 0; i < len; i++)
14609 if (buf[i] >= 'a' && buf[i] <= 'x')
14610 buf[i] -= 'a'-'A';
14611 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014612 if (!PyUnicode_Check(result)
14613 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014614 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014615 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014616 Py_DECREF(result);
14617 result = unicode;
14618 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014619 else if (len != PyUnicode_GET_LENGTH(result)) {
14620 if (PyUnicode_Resize(&result, len) < 0)
14621 Py_CLEAR(result);
14622 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014623 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014624}
14625
Ethan Furmandf3ed242014-01-05 06:50:30 -080014626/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014627 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014628 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014629 * -1 and raise an exception on error */
14630static int
Victor Stinnera47082312012-10-04 02:19:54 +020014631mainformatlong(PyObject *v,
14632 struct unicode_format_arg_t *arg,
14633 PyObject **p_output,
14634 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014635{
14636 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014637 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014638
14639 if (!PyNumber_Check(v))
14640 goto wrongtype;
14641
Ethan Furman9ab74802014-03-21 06:38:46 -070014642 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014643 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014644 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014645 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014646 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014647 if (PyErr_ExceptionMatches(PyExc_TypeError))
14648 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014649 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014650 }
14651 }
14652 else {
14653 iobj = PyNumber_Long(v);
14654 if (iobj == NULL ) {
14655 if (PyErr_ExceptionMatches(PyExc_TypeError))
14656 goto wrongtype;
14657 return -1;
14658 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014659 }
14660 assert(PyLong_Check(iobj));
14661 }
14662 else {
14663 iobj = v;
14664 Py_INCREF(iobj);
14665 }
14666
14667 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014668 && arg->width == -1 && arg->prec == -1
14669 && !(arg->flags & (F_SIGN | F_BLANK))
14670 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014671 {
14672 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014673 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014674 int base;
14675
Victor Stinnera47082312012-10-04 02:19:54 +020014676 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014677 {
14678 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014679 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014680 case 'd':
14681 case 'i':
14682 case 'u':
14683 base = 10;
14684 break;
14685 case 'o':
14686 base = 8;
14687 break;
14688 case 'x':
14689 case 'X':
14690 base = 16;
14691 break;
14692 }
14693
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014694 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14695 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014696 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014697 }
14698 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014699 return 1;
14700 }
14701
Ethan Furmanb95b5612015-01-23 20:05:18 -080014702 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014703 Py_DECREF(iobj);
14704 if (res == NULL)
14705 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014706 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014707 return 0;
14708
14709wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014710 switch(type)
14711 {
14712 case 'o':
14713 case 'x':
14714 case 'X':
14715 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014716 "%%%c format: an integer is required, "
14717 "not %.200s",
14718 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014719 break;
14720 default:
14721 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014722 "%%%c format: a number is required, "
14723 "not %.200s",
14724 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014725 break;
14726 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014727 return -1;
14728}
14729
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014730static Py_UCS4
14731formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014732{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014733 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014734 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014735 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014736 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014737 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014738 goto onError;
14739 }
14740 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014741 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014742 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014743 /* make sure number is a type of integer */
14744 if (!PyLong_Check(v)) {
14745 iobj = PyNumber_Index(v);
14746 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014747 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014748 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014749 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014750 Py_DECREF(iobj);
14751 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014752 else {
14753 x = PyLong_AsLong(v);
14754 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014755 if (x == -1 && PyErr_Occurred())
14756 goto onError;
14757
Victor Stinner8faf8212011-12-08 22:14:11 +010014758 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014759 PyErr_SetString(PyExc_OverflowError,
14760 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014761 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014762 }
14763
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014764 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014765 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014766
Benjamin Peterson29060642009-01-31 22:14:21 +000014767 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014768 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014769 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014770 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014771}
14772
Victor Stinnera47082312012-10-04 02:19:54 +020014773/* Parse options of an argument: flags, width, precision.
14774 Handle also "%(name)" syntax.
14775
14776 Return 0 if the argument has been formatted into arg->str.
14777 Return 1 if the argument has been written into ctx->writer,
14778 Raise an exception and return -1 on error. */
14779static int
14780unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14781 struct unicode_format_arg_t *arg)
14782{
14783#define FORMAT_READ(ctx) \
14784 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14785
14786 PyObject *v;
14787
Victor Stinnera47082312012-10-04 02:19:54 +020014788 if (arg->ch == '(') {
14789 /* Get argument value from a dictionary. Example: "%(name)s". */
14790 Py_ssize_t keystart;
14791 Py_ssize_t keylen;
14792 PyObject *key;
14793 int pcount = 1;
14794
14795 if (ctx->dict == NULL) {
14796 PyErr_SetString(PyExc_TypeError,
14797 "format requires a mapping");
14798 return -1;
14799 }
14800 ++ctx->fmtpos;
14801 --ctx->fmtcnt;
14802 keystart = ctx->fmtpos;
14803 /* Skip over balanced parentheses */
14804 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14805 arg->ch = FORMAT_READ(ctx);
14806 if (arg->ch == ')')
14807 --pcount;
14808 else if (arg->ch == '(')
14809 ++pcount;
14810 ctx->fmtpos++;
14811 }
14812 keylen = ctx->fmtpos - keystart - 1;
14813 if (ctx->fmtcnt < 0 || pcount > 0) {
14814 PyErr_SetString(PyExc_ValueError,
14815 "incomplete format key");
14816 return -1;
14817 }
14818 key = PyUnicode_Substring(ctx->fmtstr,
14819 keystart, keystart + keylen);
14820 if (key == NULL)
14821 return -1;
14822 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014823 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014824 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014825 }
14826 ctx->args = PyObject_GetItem(ctx->dict, key);
14827 Py_DECREF(key);
14828 if (ctx->args == NULL)
14829 return -1;
14830 ctx->args_owned = 1;
14831 ctx->arglen = -1;
14832 ctx->argidx = -2;
14833 }
14834
14835 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014836 while (--ctx->fmtcnt >= 0) {
14837 arg->ch = FORMAT_READ(ctx);
14838 ctx->fmtpos++;
14839 switch (arg->ch) {
14840 case '-': arg->flags |= F_LJUST; continue;
14841 case '+': arg->flags |= F_SIGN; continue;
14842 case ' ': arg->flags |= F_BLANK; continue;
14843 case '#': arg->flags |= F_ALT; continue;
14844 case '0': arg->flags |= F_ZERO; continue;
14845 }
14846 break;
14847 }
14848
14849 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014850 if (arg->ch == '*') {
14851 v = unicode_format_getnextarg(ctx);
14852 if (v == NULL)
14853 return -1;
14854 if (!PyLong_Check(v)) {
14855 PyErr_SetString(PyExc_TypeError,
14856 "* wants int");
14857 return -1;
14858 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014859 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014860 if (arg->width == -1 && PyErr_Occurred())
14861 return -1;
14862 if (arg->width < 0) {
14863 arg->flags |= F_LJUST;
14864 arg->width = -arg->width;
14865 }
14866 if (--ctx->fmtcnt >= 0) {
14867 arg->ch = FORMAT_READ(ctx);
14868 ctx->fmtpos++;
14869 }
14870 }
14871 else if (arg->ch >= '0' && arg->ch <= '9') {
14872 arg->width = arg->ch - '0';
14873 while (--ctx->fmtcnt >= 0) {
14874 arg->ch = FORMAT_READ(ctx);
14875 ctx->fmtpos++;
14876 if (arg->ch < '0' || arg->ch > '9')
14877 break;
14878 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14879 mixing signed and unsigned comparison. Since arg->ch is between
14880 '0' and '9', casting to int is safe. */
14881 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14882 PyErr_SetString(PyExc_ValueError,
14883 "width too big");
14884 return -1;
14885 }
14886 arg->width = arg->width*10 + (arg->ch - '0');
14887 }
14888 }
14889
14890 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014891 if (arg->ch == '.') {
14892 arg->prec = 0;
14893 if (--ctx->fmtcnt >= 0) {
14894 arg->ch = FORMAT_READ(ctx);
14895 ctx->fmtpos++;
14896 }
14897 if (arg->ch == '*') {
14898 v = unicode_format_getnextarg(ctx);
14899 if (v == NULL)
14900 return -1;
14901 if (!PyLong_Check(v)) {
14902 PyErr_SetString(PyExc_TypeError,
14903 "* wants int");
14904 return -1;
14905 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014906 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014907 if (arg->prec == -1 && PyErr_Occurred())
14908 return -1;
14909 if (arg->prec < 0)
14910 arg->prec = 0;
14911 if (--ctx->fmtcnt >= 0) {
14912 arg->ch = FORMAT_READ(ctx);
14913 ctx->fmtpos++;
14914 }
14915 }
14916 else if (arg->ch >= '0' && arg->ch <= '9') {
14917 arg->prec = arg->ch - '0';
14918 while (--ctx->fmtcnt >= 0) {
14919 arg->ch = FORMAT_READ(ctx);
14920 ctx->fmtpos++;
14921 if (arg->ch < '0' || arg->ch > '9')
14922 break;
14923 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14924 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014925 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014926 return -1;
14927 }
14928 arg->prec = arg->prec*10 + (arg->ch - '0');
14929 }
14930 }
14931 }
14932
14933 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14934 if (ctx->fmtcnt >= 0) {
14935 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14936 if (--ctx->fmtcnt >= 0) {
14937 arg->ch = FORMAT_READ(ctx);
14938 ctx->fmtpos++;
14939 }
14940 }
14941 }
14942 if (ctx->fmtcnt < 0) {
14943 PyErr_SetString(PyExc_ValueError,
14944 "incomplete format");
14945 return -1;
14946 }
14947 return 0;
14948
14949#undef FORMAT_READ
14950}
14951
14952/* Format one argument. Supported conversion specifiers:
14953
14954 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014955 - "i", "d", "u": int or float
14956 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014957 - "e", "E", "f", "F", "g", "G": float
14958 - "c": int or str (1 character)
14959
Victor Stinner8dbd4212012-12-04 09:30:24 +010014960 When possible, the output is written directly into the Unicode writer
14961 (ctx->writer). A string is created when padding is required.
14962
Victor Stinnera47082312012-10-04 02:19:54 +020014963 Return 0 if the argument has been formatted into *p_str,
14964 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014965 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014966static int
14967unicode_format_arg_format(struct unicode_formatter_t *ctx,
14968 struct unicode_format_arg_t *arg,
14969 PyObject **p_str)
14970{
14971 PyObject *v;
14972 _PyUnicodeWriter *writer = &ctx->writer;
14973
14974 if (ctx->fmtcnt == 0)
14975 ctx->writer.overallocate = 0;
14976
Victor Stinnera47082312012-10-04 02:19:54 +020014977 v = unicode_format_getnextarg(ctx);
14978 if (v == NULL)
14979 return -1;
14980
Victor Stinnera47082312012-10-04 02:19:54 +020014981
14982 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014983 case 's':
14984 case 'r':
14985 case 'a':
14986 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14987 /* Fast path */
14988 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14989 return -1;
14990 return 1;
14991 }
14992
14993 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14994 *p_str = v;
14995 Py_INCREF(*p_str);
14996 }
14997 else {
14998 if (arg->ch == 's')
14999 *p_str = PyObject_Str(v);
15000 else if (arg->ch == 'r')
15001 *p_str = PyObject_Repr(v);
15002 else
15003 *p_str = PyObject_ASCII(v);
15004 }
15005 break;
15006
15007 case 'i':
15008 case 'd':
15009 case 'u':
15010 case 'o':
15011 case 'x':
15012 case 'X':
15013 {
15014 int ret = mainformatlong(v, arg, p_str, writer);
15015 if (ret != 0)
15016 return ret;
15017 arg->sign = 1;
15018 break;
15019 }
15020
15021 case 'e':
15022 case 'E':
15023 case 'f':
15024 case 'F':
15025 case 'g':
15026 case 'G':
15027 if (arg->width == -1 && arg->prec == -1
15028 && !(arg->flags & (F_SIGN | F_BLANK)))
15029 {
15030 /* Fast path */
15031 if (formatfloat(v, arg, NULL, writer) == -1)
15032 return -1;
15033 return 1;
15034 }
15035
15036 arg->sign = 1;
15037 if (formatfloat(v, arg, p_str, NULL) == -1)
15038 return -1;
15039 break;
15040
15041 case 'c':
15042 {
15043 Py_UCS4 ch = formatchar(v);
15044 if (ch == (Py_UCS4) -1)
15045 return -1;
15046 if (arg->width == -1 && arg->prec == -1) {
15047 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015048 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015049 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015050 return 1;
15051 }
15052 *p_str = PyUnicode_FromOrdinal(ch);
15053 break;
15054 }
15055
15056 default:
15057 PyErr_Format(PyExc_ValueError,
15058 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015059 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015060 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15061 (int)arg->ch,
15062 ctx->fmtpos - 1);
15063 return -1;
15064 }
15065 if (*p_str == NULL)
15066 return -1;
15067 assert (PyUnicode_Check(*p_str));
15068 return 0;
15069}
15070
15071static int
15072unicode_format_arg_output(struct unicode_formatter_t *ctx,
15073 struct unicode_format_arg_t *arg,
15074 PyObject *str)
15075{
15076 Py_ssize_t len;
15077 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015078 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015079 Py_ssize_t pindex;
15080 Py_UCS4 signchar;
15081 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015082 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015083 Py_ssize_t sublen;
15084 _PyUnicodeWriter *writer = &ctx->writer;
15085 Py_UCS4 fill;
15086
15087 fill = ' ';
15088 if (arg->sign && arg->flags & F_ZERO)
15089 fill = '0';
15090
15091 if (PyUnicode_READY(str) == -1)
15092 return -1;
15093
15094 len = PyUnicode_GET_LENGTH(str);
15095 if ((arg->width == -1 || arg->width <= len)
15096 && (arg->prec == -1 || arg->prec >= len)
15097 && !(arg->flags & (F_SIGN | F_BLANK)))
15098 {
15099 /* Fast path */
15100 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15101 return -1;
15102 return 0;
15103 }
15104
15105 /* Truncate the string for "s", "r" and "a" formats
15106 if the precision is set */
15107 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15108 if (arg->prec >= 0 && len > arg->prec)
15109 len = arg->prec;
15110 }
15111
15112 /* Adjust sign and width */
15113 kind = PyUnicode_KIND(str);
15114 pbuf = PyUnicode_DATA(str);
15115 pindex = 0;
15116 signchar = '\0';
15117 if (arg->sign) {
15118 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15119 if (ch == '-' || ch == '+') {
15120 signchar = ch;
15121 len--;
15122 pindex++;
15123 }
15124 else if (arg->flags & F_SIGN)
15125 signchar = '+';
15126 else if (arg->flags & F_BLANK)
15127 signchar = ' ';
15128 else
15129 arg->sign = 0;
15130 }
15131 if (arg->width < len)
15132 arg->width = len;
15133
15134 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015135 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015136 if (!(arg->flags & F_LJUST)) {
15137 if (arg->sign) {
15138 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015139 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015140 }
15141 else {
15142 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015143 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015144 }
15145 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015146 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15147 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015148 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015149 }
15150
Victor Stinnera47082312012-10-04 02:19:54 +020015151 buflen = arg->width;
15152 if (arg->sign && len == arg->width)
15153 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015154 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015155 return -1;
15156
15157 /* Write the sign if needed */
15158 if (arg->sign) {
15159 if (fill != ' ') {
15160 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15161 writer->pos += 1;
15162 }
15163 if (arg->width > len)
15164 arg->width--;
15165 }
15166
15167 /* Write the numeric prefix for "x", "X" and "o" formats
15168 if the alternate form is used.
15169 For example, write "0x" for the "%#x" format. */
15170 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15171 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15172 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15173 if (fill != ' ') {
15174 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15175 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15176 writer->pos += 2;
15177 pindex += 2;
15178 }
15179 arg->width -= 2;
15180 if (arg->width < 0)
15181 arg->width = 0;
15182 len -= 2;
15183 }
15184
15185 /* Pad left with the fill character if needed */
15186 if (arg->width > len && !(arg->flags & F_LJUST)) {
15187 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015188 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015189 writer->pos += sublen;
15190 arg->width = len;
15191 }
15192
15193 /* If padding with spaces: write sign if needed and/or numeric prefix if
15194 the alternate form is used */
15195 if (fill == ' ') {
15196 if (arg->sign) {
15197 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15198 writer->pos += 1;
15199 }
15200 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15201 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15202 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15203 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15204 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15205 writer->pos += 2;
15206 pindex += 2;
15207 }
15208 }
15209
15210 /* Write characters */
15211 if (len) {
15212 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15213 str, pindex, len);
15214 writer->pos += len;
15215 }
15216
15217 /* Pad right with the fill character if needed */
15218 if (arg->width > len) {
15219 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015220 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015221 writer->pos += sublen;
15222 }
15223 return 0;
15224}
15225
15226/* Helper of PyUnicode_Format(): format one arg.
15227 Return 0 on success, raise an exception and return -1 on error. */
15228static int
15229unicode_format_arg(struct unicode_formatter_t *ctx)
15230{
15231 struct unicode_format_arg_t arg;
15232 PyObject *str;
15233 int ret;
15234
Victor Stinner8dbd4212012-12-04 09:30:24 +010015235 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015236 if (arg.ch == '%') {
15237 ctx->fmtpos++;
15238 ctx->fmtcnt--;
15239 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15240 return -1;
15241 return 0;
15242 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015243 arg.flags = 0;
15244 arg.width = -1;
15245 arg.prec = -1;
15246 arg.sign = 0;
15247 str = NULL;
15248
Victor Stinnera47082312012-10-04 02:19:54 +020015249 ret = unicode_format_arg_parse(ctx, &arg);
15250 if (ret == -1)
15251 return -1;
15252
15253 ret = unicode_format_arg_format(ctx, &arg, &str);
15254 if (ret == -1)
15255 return -1;
15256
15257 if (ret != 1) {
15258 ret = unicode_format_arg_output(ctx, &arg, str);
15259 Py_DECREF(str);
15260 if (ret == -1)
15261 return -1;
15262 }
15263
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015264 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015265 PyErr_SetString(PyExc_TypeError,
15266 "not all arguments converted during string formatting");
15267 return -1;
15268 }
15269 return 0;
15270}
15271
Alexander Belopolsky40018472011-02-26 01:02:56 +000015272PyObject *
15273PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015274{
Victor Stinnera47082312012-10-04 02:19:54 +020015275 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015276
Guido van Rossumd57fd912000-03-10 22:53:23 +000015277 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015278 PyErr_BadInternalCall();
15279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015280 }
Victor Stinnera47082312012-10-04 02:19:54 +020015281
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015282 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015283 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015284
15285 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015286 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15287 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15288 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15289 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015290
Victor Stinner8f674cc2013-04-17 23:02:17 +020015291 _PyUnicodeWriter_Init(&ctx.writer);
15292 ctx.writer.min_length = ctx.fmtcnt + 100;
15293 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015294
Guido van Rossumd57fd912000-03-10 22:53:23 +000015295 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015296 ctx.arglen = PyTuple_Size(args);
15297 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015298 }
15299 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015300 ctx.arglen = -1;
15301 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015302 }
Victor Stinnera47082312012-10-04 02:19:54 +020015303 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015304 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015305 ctx.dict = args;
15306 else
15307 ctx.dict = NULL;
15308 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015309
Victor Stinnera47082312012-10-04 02:19:54 +020015310 while (--ctx.fmtcnt >= 0) {
15311 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015312 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015313
15314 nonfmtpos = ctx.fmtpos++;
15315 while (ctx.fmtcnt >= 0 &&
15316 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15317 ctx.fmtpos++;
15318 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015319 }
Victor Stinnera47082312012-10-04 02:19:54 +020015320 if (ctx.fmtcnt < 0) {
15321 ctx.fmtpos--;
15322 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015323 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015324
Victor Stinnercfc4c132013-04-03 01:48:39 +020015325 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15326 nonfmtpos, ctx.fmtpos) < 0)
15327 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 }
15329 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015330 ctx.fmtpos++;
15331 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015332 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015333 }
15334 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015335
Victor Stinnera47082312012-10-04 02:19:54 +020015336 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015337 PyErr_SetString(PyExc_TypeError,
15338 "not all arguments converted during string formatting");
15339 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015340 }
15341
Victor Stinnera47082312012-10-04 02:19:54 +020015342 if (ctx.args_owned) {
15343 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015344 }
Victor Stinnera47082312012-10-04 02:19:54 +020015345 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015346
Benjamin Peterson29060642009-01-31 22:14:21 +000015347 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015348 _PyUnicodeWriter_Dealloc(&ctx.writer);
15349 if (ctx.args_owned) {
15350 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015351 }
15352 return NULL;
15353}
15354
Jeremy Hylton938ace62002-07-17 16:30:39 +000015355static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015356unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15357
Tim Peters6d6c1a32001-08-02 04:15:00 +000015358static PyObject *
15359unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15360{
Benjamin Peterson29060642009-01-31 22:14:21 +000015361 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015362 static char *kwlist[] = {"object", "encoding", "errors", 0};
15363 char *encoding = NULL;
15364 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015365
Benjamin Peterson14339b62009-01-31 16:36:08 +000015366 if (type != &PyUnicode_Type)
15367 return unicode_subtype_new(type, args, kwds);
15368 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015369 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015370 return NULL;
15371 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015372 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 if (encoding == NULL && errors == NULL)
15374 return PyObject_Str(x);
15375 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015376 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015377}
15378
Guido van Rossume023fe02001-08-30 03:12:59 +000015379static PyObject *
15380unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15381{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015382 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015383 Py_ssize_t length, char_size;
15384 int share_wstr, share_utf8;
15385 unsigned int kind;
15386 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015387
Benjamin Peterson14339b62009-01-31 16:36:08 +000015388 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015389
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015390 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015391 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015392 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015393 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015394 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015395 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015396 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015397 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015398
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015399 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015400 if (self == NULL) {
15401 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015402 return NULL;
15403 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015404 kind = PyUnicode_KIND(unicode);
15405 length = PyUnicode_GET_LENGTH(unicode);
15406
15407 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015408#ifdef Py_DEBUG
15409 _PyUnicode_HASH(self) = -1;
15410#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015411 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015412#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015413 _PyUnicode_STATE(self).interned = 0;
15414 _PyUnicode_STATE(self).kind = kind;
15415 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015416 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015417 _PyUnicode_STATE(self).ready = 1;
15418 _PyUnicode_WSTR(self) = NULL;
15419 _PyUnicode_UTF8_LENGTH(self) = 0;
15420 _PyUnicode_UTF8(self) = NULL;
15421 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015422 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015423
15424 share_utf8 = 0;
15425 share_wstr = 0;
15426 if (kind == PyUnicode_1BYTE_KIND) {
15427 char_size = 1;
15428 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15429 share_utf8 = 1;
15430 }
15431 else if (kind == PyUnicode_2BYTE_KIND) {
15432 char_size = 2;
15433 if (sizeof(wchar_t) == 2)
15434 share_wstr = 1;
15435 }
15436 else {
15437 assert(kind == PyUnicode_4BYTE_KIND);
15438 char_size = 4;
15439 if (sizeof(wchar_t) == 4)
15440 share_wstr = 1;
15441 }
15442
15443 /* Ensure we won't overflow the length. */
15444 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15445 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015446 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015447 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015448 data = PyObject_MALLOC((length + 1) * char_size);
15449 if (data == NULL) {
15450 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015451 goto onError;
15452 }
15453
Victor Stinnerc3c74152011-10-02 20:39:55 +020015454 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015455 if (share_utf8) {
15456 _PyUnicode_UTF8_LENGTH(self) = length;
15457 _PyUnicode_UTF8(self) = data;
15458 }
15459 if (share_wstr) {
15460 _PyUnicode_WSTR_LENGTH(self) = length;
15461 _PyUnicode_WSTR(self) = (wchar_t *)data;
15462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015463
Christian Heimesf051e432016-09-13 20:22:02 +020015464 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015465 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015466 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015467#ifdef Py_DEBUG
15468 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15469#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015470 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015471 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015472
15473onError:
15474 Py_DECREF(unicode);
15475 Py_DECREF(self);
15476 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015477}
15478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015479PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015480"str(object='') -> str\n\
15481str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015482\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015483Create a new string object from the given object. If encoding or\n\
15484errors is specified, then the object must expose a data buffer\n\
15485that will be decoded using the given encoding and error handler.\n\
15486Otherwise, returns the result of object.__str__() (if defined)\n\
15487or repr(object).\n\
15488encoding defaults to sys.getdefaultencoding().\n\
15489errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015490
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015491static PyObject *unicode_iter(PyObject *seq);
15492
Guido van Rossumd57fd912000-03-10 22:53:23 +000015493PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015494 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015495 "str", /* tp_name */
15496 sizeof(PyUnicodeObject), /* tp_basicsize */
15497 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015498 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015499 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015500 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015501 0, /* tp_getattr */
15502 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015503 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015504 unicode_repr, /* tp_repr */
15505 &unicode_as_number, /* tp_as_number */
15506 &unicode_as_sequence, /* tp_as_sequence */
15507 &unicode_as_mapping, /* tp_as_mapping */
15508 (hashfunc) unicode_hash, /* tp_hash*/
15509 0, /* tp_call*/
15510 (reprfunc) unicode_str, /* tp_str */
15511 PyObject_GenericGetAttr, /* tp_getattro */
15512 0, /* tp_setattro */
15513 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015514 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015515 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15516 unicode_doc, /* tp_doc */
15517 0, /* tp_traverse */
15518 0, /* tp_clear */
15519 PyUnicode_RichCompare, /* tp_richcompare */
15520 0, /* tp_weaklistoffset */
15521 unicode_iter, /* tp_iter */
15522 0, /* tp_iternext */
15523 unicode_methods, /* tp_methods */
15524 0, /* tp_members */
15525 0, /* tp_getset */
15526 &PyBaseObject_Type, /* tp_base */
15527 0, /* tp_dict */
15528 0, /* tp_descr_get */
15529 0, /* tp_descr_set */
15530 0, /* tp_dictoffset */
15531 0, /* tp_init */
15532 0, /* tp_alloc */
15533 unicode_new, /* tp_new */
15534 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015535};
15536
15537/* Initialize the Unicode implementation */
15538
Victor Stinner331a6a52019-05-27 16:39:22 +020015539PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015540_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015541{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015542 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015543 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015544 0x000A, /* LINE FEED */
15545 0x000D, /* CARRIAGE RETURN */
15546 0x001C, /* FILE SEPARATOR */
15547 0x001D, /* GROUP SEPARATOR */
15548 0x001E, /* RECORD SEPARATOR */
15549 0x0085, /* NEXT LINE */
15550 0x2028, /* LINE SEPARATOR */
15551 0x2029, /* PARAGRAPH SEPARATOR */
15552 };
15553
Victor Stinner90ed8a62020-06-24 00:34:07 +020015554 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
15555 // optimized to always use state->empty without having to check if it is
15556 // NULL or not.
15557 PyObject *empty = PyUnicode_New(1, 0);
15558 if (empty == NULL) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015559 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015560 }
Victor Stinner90ed8a62020-06-24 00:34:07 +020015561 PyUnicode_1BYTE_DATA(empty)[0] = 0;
15562 _PyUnicode_LENGTH(empty) = 0;
15563 assert(_PyUnicode_CheckConsistency(empty, 1));
15564
15565 struct _Py_unicode_state *state = &tstate->interp->unicode;
15566 assert(state->empty == NULL);
15567 state->empty = empty;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015568
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015569 if (_Py_IsMainInterpreter(tstate)) {
15570 /* initialize the linebreak bloom filter */
15571 bloom_linebreak = make_bloom_mask(
15572 PyUnicode_2BYTE_KIND, linebreak,
15573 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015574
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015575 if (PyType_Ready(&PyUnicode_Type) < 0) {
15576 return _PyStatus_ERR("Can't initialize unicode type");
15577 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015578
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015579 if (PyType_Ready(&EncodingMapType) < 0) {
15580 return _PyStatus_ERR("Can't initialize encoding map type");
15581 }
15582 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15583 return _PyStatus_ERR("Can't initialize field name iterator type");
15584 }
15585 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15586 return _PyStatus_ERR("Can't initialize formatter iter type");
15587 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015588 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015589 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015590}
15591
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015592
Walter Dörwald16807132007-05-25 13:52:07 +000015593void
15594PyUnicode_InternInPlace(PyObject **p)
15595{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015596 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015597#ifdef Py_DEBUG
15598 assert(s != NULL);
15599 assert(_PyUnicode_CHECK(s));
15600#else
Victor Stinner607b1022020-05-05 18:50:30 +020015601 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015602 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015603 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015604#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015605
Benjamin Peterson14339b62009-01-31 16:36:08 +000015606 /* If it's a subclass, we don't really know what putting
15607 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015608 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015609 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015610 }
15611
15612 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015613 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015614 }
15615
15616#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015617 if (interned == NULL) {
15618 interned = PyDict_New();
15619 if (interned == NULL) {
15620 PyErr_Clear(); /* Don't leave an exception */
15621 return;
15622 }
15623 }
Victor Stinner607b1022020-05-05 18:50:30 +020015624
15625 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015626 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015627 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015628 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015629
Berker Peksagced8d4c2016-07-25 04:40:39 +030015630 if (t == NULL) {
15631 PyErr_Clear();
15632 return;
15633 }
Victor Stinner607b1022020-05-05 18:50:30 +020015634
Berker Peksagced8d4c2016-07-25 04:40:39 +030015635 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015636 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015637 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015638 return;
15639 }
Victor Stinner607b1022020-05-05 18:50:30 +020015640
Benjamin Peterson14339b62009-01-31 16:36:08 +000015641 /* The two references in interned are not counted by refcnt.
15642 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015643 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015644 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015645#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015646}
15647
15648void
15649PyUnicode_InternImmortal(PyObject **p)
15650{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015651 PyUnicode_InternInPlace(p);
15652 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015653 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015654 Py_INCREF(*p);
15655 }
Walter Dörwald16807132007-05-25 13:52:07 +000015656}
15657
15658PyObject *
15659PyUnicode_InternFromString(const char *cp)
15660{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015661 PyObject *s = PyUnicode_FromString(cp);
15662 if (s == NULL)
15663 return NULL;
15664 PyUnicode_InternInPlace(&s);
15665 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015666}
15667
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015668
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every key of the `interned` dict, then clear and
   release the dict.

   This exists to help a leak detector: the interned strings are not
   forcibly deallocated.  Instead each one gets the references that
   interning removed from its refcount back (1 for an immortal string,
   2 for a mortal one) and is flagged SSTATE_NOT_INTERNED. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n", count);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t pos = 0; pos < count; pos++) {
        PyObject *str = PyList_GET_ITEM(keys, pos);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        unsigned int how = PyUnicode_CHECK_INTERNED(str);
        if (how == SSTATE_INTERNED_IMMORTAL) {
            Py_SET_REFCNT(str, Py_REFCNT(str) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else if (how == SSTATE_INTERNED_MORTAL) {
            Py_SET_REFCNT(str, Py_REFCNT(str) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else {
            /* SSTATE_NOT_INTERNED or an unknown state: a key of the
               interned dict must be interned. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015728
15729
15730/********************* Unicode Iterator **************************/
15731
/* State of a str iterator (created by unicode_iter()). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15737
15738static void
15739unicodeiter_dealloc(unicodeiterobject *it)
15740{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015741 _PyObject_GC_UNTRACK(it);
15742 Py_XDECREF(it->it_seq);
15743 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015744}
15745
/* tp_traverse: report the iterated string (it_seq) to the garbage
   collector.  Return 0 unless the visit callback asks to stop. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    /* Py_VISIT relies on the parameters being named `visit` and `arg`. */
    Py_VISIT(it->it_seq);
    return 0;
}
15752
15753static PyObject *
15754unicodeiter_next(unicodeiterobject *it)
15755{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015756 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015757
Benjamin Peterson14339b62009-01-31 16:36:08 +000015758 assert(it != NULL);
15759 seq = it->it_seq;
15760 if (seq == NULL)
15761 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015762 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015764 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15765 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015766 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015767 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15768 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015769 if (item != NULL)
15770 ++it->it_index;
15771 return item;
15772 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015773
Benjamin Peterson14339b62009-01-31 16:36:08 +000015774 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015775 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015776 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015777}
15778
15779static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015780unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015781{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015782 Py_ssize_t len = 0;
15783 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015784 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015785 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015786}
15787
15788PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15789
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015790static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015791unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015792{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015793 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015794 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015795 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015796 it->it_seq, it->it_index);
15797 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015798 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015799 if (u == NULL)
15800 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015801 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015802 }
15803}
15804
15805PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15806
15807static PyObject *
15808unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15809{
15810 Py_ssize_t index = PyLong_AsSsize_t(state);
15811 if (index == -1 && PyErr_Occurred())
15812 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015813 if (it->it_seq != NULL) {
15814 if (index < 0)
15815 index = 0;
15816 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15817 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15818 it->it_index = index;
15819 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015820 Py_RETURN_NONE;
15821}
15822
15823PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15824
/* Method table of the str iterator type (PyUnicodeIter_Type). */
static PyMethodDef unicodeiter_methods[] = {
    /* Number of characters remaining */
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    /* Pickling support */
    {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    /* Takes the index to resume from */
    {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15834
/* Type object of the iterator returned by unicode_iter(). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_vectorcall_offset */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_as_async */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
15867
15868static PyObject *
15869unicode_iter(PyObject *seq)
15870{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015871 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015872
Benjamin Peterson14339b62009-01-31 16:36:08 +000015873 if (!PyUnicode_Check(seq)) {
15874 PyErr_BadInternalCall();
15875 return NULL;
15876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015877 if (PyUnicode_READY(seq) == -1)
15878 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015879 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15880 if (it == NULL)
15881 return NULL;
15882 it->it_index = 0;
15883 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015884 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015885 _PyObject_GC_TRACK(it);
15886 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015887}
15888
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015889
15890size_t
15891Py_UNICODE_strlen(const Py_UNICODE *u)
15892{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015893 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015894}
15895
15896Py_UNICODE*
15897Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15898{
15899 Py_UNICODE *u = s1;
15900 while ((*u++ = *s2++));
15901 return s1;
15902}
15903
15904Py_UNICODE*
15905Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15906{
15907 Py_UNICODE *u = s1;
15908 while ((*u++ = *s2++))
15909 if (n-- == 0)
15910 break;
15911 return s1;
15912}
15913
15914Py_UNICODE*
15915Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15916{
15917 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015918 u1 += wcslen(u1);
15919 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015920 return s1;
15921}
15922
15923int
15924Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15925{
15926 while (*s1 && *s2 && *s1 == *s2)
15927 s1++, s2++;
15928 if (*s1 && *s2)
15929 return (*s1 < *s2) ? -1 : +1;
15930 if (*s1)
15931 return 1;
15932 if (*s2)
15933 return -1;
15934 return 0;
15935}
15936
15937int
15938Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15939{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015940 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015941 for (; n != 0; n--) {
15942 u1 = *s1;
15943 u2 = *s2;
15944 if (u1 != u2)
15945 return (u1 < u2) ? -1 : +1;
15946 if (u1 == '\0')
15947 return 0;
15948 s1++;
15949 s2++;
15950 }
15951 return 0;
15952}
15953
15954Py_UNICODE*
15955Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15956{
15957 const Py_UNICODE *p;
15958 for (p = s; *p; p++)
15959 if (*p == c)
15960 return (Py_UNICODE*)p;
15961 return NULL;
15962}
15963
15964Py_UNICODE*
15965Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15966{
15967 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015968 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015969 while (p != s) {
15970 p--;
15971 if (*p == c)
15972 return (Py_UNICODE*)p;
15973 }
15974 return NULL;
15975}
Victor Stinner331ea922010-08-10 16:37:20 +000015976
Victor Stinner71133ff2010-09-01 23:43:53 +000015977Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015978PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015979{
Victor Stinner577db2c2011-10-11 22:12:48 +020015980 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015981 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015983 if (!PyUnicode_Check(unicode)) {
15984 PyErr_BadArgument();
15985 return NULL;
15986 }
Inada Naoki2c4928d2020-06-17 20:09:44 +090015987_Py_COMP_DIAG_PUSH
15988_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015989 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Inada Naoki2c4928d2020-06-17 20:09:44 +090015990_Py_COMP_DIAG_POP
Victor Stinner577db2c2011-10-11 22:12:48 +020015991 if (u == NULL)
15992 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015993 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015994 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015995 PyErr_NoMemory();
15996 return NULL;
15997 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015998 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015999 size *= sizeof(Py_UNICODE);
16000 copy = PyMem_Malloc(size);
16001 if (copy == NULL) {
16002 PyErr_NoMemory();
16003 return NULL;
16004 }
Victor Stinner577db2c2011-10-11 22:12:48 +020016005 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000016006 return copy;
16007}
Martin v. Löwis5b222132007-06-10 09:51:05 +000016008
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016009
Victor Stinner709d23d2019-05-02 14:56:30 -040016010static int
16011encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016012{
Victor Stinner709d23d2019-05-02 14:56:30 -040016013 int res;
16014 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16015 if (res == -2) {
16016 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16017 return -1;
16018 }
16019 if (res < 0) {
16020 PyErr_NoMemory();
16021 return -1;
16022 }
16023 return 0;
16024}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016025
Victor Stinner709d23d2019-05-02 14:56:30 -040016026
16027static int
16028config_get_codec_name(wchar_t **config_encoding)
16029{
16030 char *encoding;
16031 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16032 return -1;
16033 }
16034
16035 PyObject *name_obj = NULL;
16036 PyObject *codec = _PyCodec_Lookup(encoding);
16037 PyMem_RawFree(encoding);
16038
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016039 if (!codec)
16040 goto error;
16041
16042 name_obj = PyObject_GetAttrString(codec, "name");
16043 Py_CLEAR(codec);
16044 if (!name_obj) {
16045 goto error;
16046 }
16047
Victor Stinner709d23d2019-05-02 14:56:30 -040016048 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16049 Py_DECREF(name_obj);
16050 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016051 goto error;
16052 }
16053
Victor Stinner709d23d2019-05-02 14:56:30 -040016054 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16055 if (raw_wname == NULL) {
16056 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016057 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016058 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016059 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016060
16061 PyMem_RawFree(*config_encoding);
16062 *config_encoding = raw_wname;
16063
16064 PyMem_Free(wname);
16065 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016066
16067error:
16068 Py_XDECREF(codec);
16069 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016070 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016071}
16072
16073
Victor Stinner331a6a52019-05-27 16:39:22 +020016074static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016075init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016076{
Victor Stinner709d23d2019-05-02 14:56:30 -040016077 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016078 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016079 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016080 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016081 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016082 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016083 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016084}
16085
16086
Victor Stinner709d23d2019-05-02 14:56:30 -040016087static int
16088init_fs_codec(PyInterpreterState *interp)
16089{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016090 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016091
16092 _Py_error_handler error_handler;
16093 error_handler = get_error_handler_wide(config->filesystem_errors);
16094 if (error_handler == _Py_ERROR_UNKNOWN) {
16095 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16096 return -1;
16097 }
16098
16099 char *encoding, *errors;
16100 if (encode_wstr_utf8(config->filesystem_encoding,
16101 &encoding,
16102 "filesystem_encoding") < 0) {
16103 return -1;
16104 }
16105
16106 if (encode_wstr_utf8(config->filesystem_errors,
16107 &errors,
16108 "filesystem_errors") < 0) {
16109 PyMem_RawFree(encoding);
16110 return -1;
16111 }
16112
Victor Stinner3d17c042020-05-14 01:48:38 +020016113 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16114 PyMem_RawFree(fs_codec->encoding);
16115 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016116 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016117 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16118 PyMem_RawFree(fs_codec->errors);
16119 fs_codec->errors = errors;
16120 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016121
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016122#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016123 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016124#endif
16125
Victor Stinner709d23d2019-05-02 14:56:30 -040016126 /* At this point, PyUnicode_EncodeFSDefault() and
16127 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16128 the C implementation of the filesystem encoding. */
16129
16130 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16131 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016132 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16133 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016134 PyErr_NoMemory();
16135 return -1;
16136 }
16137 return 0;
16138}
16139
16140
Victor Stinner331a6a52019-05-27 16:39:22 +020016141static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016142init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016143{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016144 PyInterpreterState *interp = tstate->interp;
16145
Victor Stinner709d23d2019-05-02 14:56:30 -040016146 /* Update the filesystem encoding to the normalized Python codec name.
16147 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16148 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016149 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016150 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016151 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016152 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016153 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016154 }
16155
Victor Stinner709d23d2019-05-02 14:56:30 -040016156 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016157 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016158 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016159 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016160}
16161
16162
Victor Stinner331a6a52019-05-27 16:39:22 +020016163PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016164_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016165{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016166 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016167 if (_PyStatus_EXCEPTION(status)) {
16168 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016169 }
16170
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016171 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016172}
16173
16174
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016175static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016176_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016177{
Victor Stinner3d17c042020-05-14 01:48:38 +020016178 PyMem_RawFree(fs_codec->encoding);
16179 fs_codec->encoding = NULL;
16180 fs_codec->utf8 = 0;
16181 PyMem_RawFree(fs_codec->errors);
16182 fs_codec->errors = NULL;
16183 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016184}
16185
16186
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Return 0 on success, -1 with an exception set on error.  The
   configuration is left unchanged when the new strings cannot be
   allocated. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16212
16213
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016214void
Victor Stinner3d483342019-11-22 12:27:50 +010016215_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016216{
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016217 struct _Py_unicode_state *state = &tstate->interp->unicode;
16218
16219 int is_main_interp = _Py_IsMainInterpreter(tstate);
16220 if (is_main_interp) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016221#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016222 /* Insure++ is a memory analysis tool that aids in discovering
16223 * memory leaks and other memory problems. On Python exit, the
16224 * interned string dictionaries are flagged as being in use at exit
16225 * (which it is). Under normal circumstances, this is fine because
16226 * the memory will be automatically reclaimed by the system. Under
16227 * memory debugging, it's a huge source of useless noise, so we
16228 * trade off slower shutdown for less distraction in the memory
16229 * reports. -baw
16230 */
16231 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016232#endif /* __INSURE__ */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016233 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016234
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016235 Py_CLEAR(state->empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016236
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016237 if (is_main_interp) {
Victor Stinner607b1022020-05-05 18:50:30 +020016238#ifdef LATIN1_SINGLETONS
Victor Stinner3d483342019-11-22 12:27:50 +010016239 for (Py_ssize_t i = 0; i < 256; i++) {
16240 Py_CLEAR(unicode_latin1[i]);
16241 }
Victor Stinner607b1022020-05-05 18:50:30 +020016242#endif
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016243 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016244 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016245
Victor Stinner3d17c042020-05-14 01:48:38 +020016246 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016247}
16248
16249
Georg Brandl66c221e2010-10-14 07:04:07 +000016250/* A _string module, to export formatter_parser and formatter_field_name_split
16251 to the string.Formatter class implemented in Python. */
16252
/* Functions exported by the _string module; each takes one argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
16260
/* Definition of the _string module. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_slots */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
16272
16273PyMODINIT_FUNC
16274PyInit__string(void)
16275{
16276 return PyModule_Create(&_string_module);
16277}
16278
16279
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016280#ifdef __cplusplus
16281}
16282#endif