blob: c75eb077e0c80d5e0712d890dcef02e362b9d86d [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020047#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinnerbcda8f12018-11-21 22:27:47 +010048#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020049#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040050#include "pycore_pylifecycle.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020051#include "pycore_pystate.h" // _PyInterpreterState_GET()
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000052#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070053#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
/* #define INTERNED_STATS 1 */
62
63
Larry Hastings61272b72014-01-07 12:41:53 -080064/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090065class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080066[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090067/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
69/*[python input]
70class Py_UCS4_converter(CConverter):
71 type = 'Py_UCS4'
72 converter = 'convert_uc'
73
74 def converter_init(self):
75 if self.default is not unspecified:
76 self.c_default = ascii(self.default)
77 if len(self.c_default) > 4 or self.c_default[0] != "'":
78 self.c_default = hex(ord(self.default))
79
80[python start generated code]*/
81/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Sanity check applied before the accessor macros touch an object:
   debug builds run _PyUnicode_CheckConsistency() with check_content=0,
   all other builds only perform the PyUnicode_Check() type test. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200104
/* --- Direct field accessors ---------------------------------------------

   The underscore-prefixed macros expand to a plain (parenthesized) struct
   member access; the ones that assert first validate the object with
   _PyUnicode_CHECK(). */

/* Cached UTF-8 buffer pointer stored in the compact header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 bytes of a ready string: for a compact ASCII object this is the
   memory located immediately after the PyASCIIObject header, otherwise
   the stored utf8 pointer. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
          ? ((char*)((PyASCIIObject*)(op) + 1)) \
          : _PyUnicode_UTF8(op)))

/* Stored length of the cached UTF-8 buffer. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* UTF-8 length of a ready string: the object's own length for compact
   ASCII objects, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
          ? ((PyASCIIObject*)(op))->length \
          : _PyUnicode_UTF8_LENGTH(op)))

/* wchar_t representation pointer and its length. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* Unchecked access to the length, state bit-field and hash members. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to state.kind and length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Data pointer member of a PyUnicodeObject (data.any). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200139
/* File-local replacement for the public PyUnicode_READY(): evaluates to 0
   when the object is already ready, otherwise to the result of
   _PyUnicode_Ready(op).  The object is validated with _PyUnicode_CHECK()
   first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200146
/* True if the cached UTF-8 pointer is the very same pointer as the
   object's character data, i.e. no separate UTF-8 buffer exists.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the very same pointer as the object's
   character data.  The comparison uses the macro argument `op`; the
   previous expansion referenced an identifier named `unicode` from the
   caller's scope, so it only worked when the caller's variable happened
   to have that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
154
/* Non-zero when the object has its own UTF-8 buffer: it is not compact
   ASCII, the utf8 pointer is set, and that pointer is not the character
   data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op) &&                 \
     _PyUnicode_UTF8(op) != NULL &&                     \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object has its own wstr buffer: the wstr pointer is
   set and either the object is not ready yet or the pointer is not the
   character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL &&                     \
     (!PyUnicode_IS_READY(op) ||                        \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
168
/* Copy a run of characters while converting the element type.

   from_type / to_type  element type names of source and destination
   begin, end           source range [begin, end), as "from_type *"
   to                   destination buffer, as "to_type *"

   Each source element is cast to to_type and stored in order.  The first
   loop handles the leading multiple-of-four part of the range four
   elements per iteration; the second loop copies whatever remains. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)   \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t n = (_end) - (_iter);                                \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < (_end); ++_iter, ++_to) {                        \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200192
/* Platform-tuned over-allocation factor.  Per the original notes, 2
   corresponds to over-allocating by 50% (best on Windows) and 4 to
   over-allocating by 25% (best on Linux). */
#if defined(MS_WINDOWS)
#  define OVERALLOCATE_FACTOR 2
#else
#  define OVERALLOCATE_FACTOR 4
#endif
200
Victor Stinner607b1022020-05-05 18:50:30 +0200201/* bpo-40521: Interned strings are shared by all interpreters. */
202#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
203# define INTERNED_STRINGS
204#endif
205
Walter Dörwald16807132007-05-25 13:52:07 +0000206/* This dictionary holds all interned unicode strings. Note that references
207 to strings in this dictionary are *not* counted in the string's ob_refcnt.
208 When the interned string reaches a refcnt of 0 the string deallocation
209 function will delete the reference from this dictionary.
210
211 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000212 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000213*/
Victor Stinner607b1022020-05-05 18:50:30 +0200214#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200215static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200216#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000217
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000218/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200219static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200220
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that allocation fails,
   unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                  \
    do {                                                            \
        if (unicode_empty == NULL) {                                \
            unicode_empty = PyUnicode_New(0, 0);                    \
            if (unicode_empty != NULL) {                            \
                Py_INCREF(unicode_empty);                           \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                       \
        }                                                           \
        else {                                                      \
            Py_INCREF(unicode_empty);                               \
        }                                                           \
    } while (0)

/* Return the shared empty string from the enclosing function, as a new
   reference.  The returned value is NULL if the singleton could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                                  \
    do {                                                            \
        _Py_INCREF_UNICODE_EMPTY();                                 \
        return unicode_empty;                                       \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000239
Victor Stinner59423e32018-11-26 13:40:01 +0100240static inline void
241unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
242 Py_ssize_t start, Py_ssize_t length)
243{
244 assert(0 <= start);
245 assert(kind != PyUnicode_WCHAR_KIND);
246 switch (kind) {
247 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100248 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100249 Py_UCS1 ch = (unsigned char)value;
250 Py_UCS1 *to = (Py_UCS1 *)data + start;
251 memset(to, ch, length);
252 break;
253 }
254 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS2 ch = (Py_UCS2)value;
257 Py_UCS2 *to = (Py_UCS2 *)data + start;
258 const Py_UCS2 *end = to + length;
259 for (; to < end; ++to) *to = ch;
260 break;
261 }
262 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100263 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100264 Py_UCS4 ch = value;
265 Py_UCS4 * to = (Py_UCS4 *)data + start;
266 const Py_UCS4 *end = to + length;
267 for (; to < end; ++to) *to = ch;
268 break;
269 }
270 default: Py_UNREACHABLE();
271 }
272}
273
274
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200275/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700276static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900278static inline void
279_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400280static PyObject *
281unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
282 const char *errors);
283static PyObject *
284unicode_decode_utf8(const char *s, Py_ssize_t size,
285 _Py_error_handler error_handler, const char *errors,
286 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200287
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200288/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200289static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200290
Victor Stinner607b1022020-05-05 18:50:30 +0200291/* bpo-40521: Latin1 singletons are shared by all interpreters. */
292#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
293# define LATIN1_SINGLETONS
294#endif
295
296#ifdef LATIN1_SINGLETONS
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297/* Single character Unicode strings in the Latin-1 range are being
298 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200299static PyObject *unicode_latin1[256] = {NULL};
Victor Stinner607b1022020-05-05 18:50:30 +0200300#endif
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000301
/* Fast detection of the most frequent whitespace characters.

   128-entry lookup table indexed by ASCII code: an entry is 1 for the
   characters listed below and 0 for every other code. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
332
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200333/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200334static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200335static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100336static int unicode_modifiable(PyObject *unicode);
337
Victor Stinnerfe226c02011-10-03 03:52:20 +0200338
Alexander Belopolsky40018472011-02-26 01:02:56 +0000339static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100340_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200341static PyObject *
342_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
343static PyObject *
344_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
345
346static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000347unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000348 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100349 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000350 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
351
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352static void
353raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300354 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100355 PyObject *unicode,
356 Py_ssize_t startpos, Py_ssize_t endpos,
357 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000358
/* Same idea for line breaks: 128-entry lookup table indexed by ASCII
   code, 1 for the characters listed below and 0 for every other code. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
386
INADA Naoki3ae20562017-01-16 20:41:20 +0900387static int convert_uc(PyObject *obj, void *addr);
388
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300389#include "clinic/unicodeobject.c.h"
390
Victor Stinner3d4226a2018-08-29 22:21:32 +0200391_Py_error_handler
392_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200393{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200395 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200396 }
397 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 }
400 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200401 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200402 }
403 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200404 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200405 }
406 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200407 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200408 }
409 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200410 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200411 }
412 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200413 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200414 }
Victor Stinner50149202015-09-22 00:26:54 +0200415 return _Py_ERROR_OTHER;
416}
417
Victor Stinner709d23d2019-05-02 14:56:30 -0400418
419static _Py_error_handler
420get_error_handler_wide(const wchar_t *errors)
421{
422 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
423 return _Py_ERROR_STRICT;
424 }
425 if (wcscmp(errors, L"surrogateescape") == 0) {
426 return _Py_ERROR_SURROGATEESCAPE;
427 }
428 if (wcscmp(errors, L"replace") == 0) {
429 return _Py_ERROR_REPLACE;
430 }
431 if (wcscmp(errors, L"ignore") == 0) {
432 return _Py_ERROR_IGNORE;
433 }
434 if (wcscmp(errors, L"backslashreplace") == 0) {
435 return _Py_ERROR_BACKSLASHREPLACE;
436 }
437 if (wcscmp(errors, L"surrogatepass") == 0) {
438 return _Py_ERROR_SURROGATEPASS;
439 }
440 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
441 return _Py_ERROR_XMLCHARREFREPLACE;
442 }
443 return _Py_ERROR_OTHER;
444}
445
446
Victor Stinner22eb6892019-06-26 00:51:05 +0200447static inline int
448unicode_check_encoding_errors(const char *encoding, const char *errors)
449{
450 if (encoding == NULL && errors == NULL) {
451 return 0;
452 }
453
Victor Stinner81a7be32020-04-14 15:14:01 +0200454 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200455#ifndef Py_DEBUG
456 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200457 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200458 return 0;
459 }
460#else
461 /* Always check in debug mode */
462#endif
463
464 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
465 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200466 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200467 return 0;
468 }
469
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200470 /* Disable checks during Python finalization. For example, it allows to
471 call _PyObject_Dump() during finalization for debugging purpose. */
472 if (interp->finalizing) {
473 return 0;
474 }
475
Victor Stinner22eb6892019-06-26 00:51:05 +0200476 if (encoding != NULL) {
477 PyObject *handler = _PyCodec_Lookup(encoding);
478 if (handler == NULL) {
479 return -1;
480 }
481 Py_DECREF(handler);
482 }
483
484 if (errors != NULL) {
485 PyObject *handler = PyCodec_LookupError(errors);
486 if (handler == NULL) {
487 return -1;
488 }
489 Py_DECREF(handler);
490 }
491 return 0;
492}
493
494
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300495/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
496 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000497Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000498PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000499{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000500#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000501 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000502#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000503 /* This is actually an illegal character, so it should
504 not be passed to unichr. */
505 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000506#endif
507}
508
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200509int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100510_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200511{
Victor Stinner68762572019-10-07 18:42:01 +0200512#define CHECK(expr) \
513 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
514
Victor Stinner910337b2011-10-03 03:20:16 +0200515 PyASCIIObject *ascii;
516 unsigned int kind;
517
Victor Stinner68762572019-10-07 18:42:01 +0200518 assert(op != NULL);
519 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200520
521 ascii = (PyASCIIObject *)op;
522 kind = ascii->state.kind;
523
Victor Stinnera3b334d2011-10-03 13:53:37 +0200524 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND);
526 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200527 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200528 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200529 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200530 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200531
Victor Stinnera41463c2011-10-04 01:05:08 +0200532 if (ascii->state.compact == 1) {
533 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200534 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200535 || kind == PyUnicode_2BYTE_KIND
536 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->state.ascii == 0);
538 CHECK(ascii->state.ready == 1);
539 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100540 }
541 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
543
544 data = unicode->data.any;
545 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(ascii->length == 0);
547 CHECK(ascii->hash == -1);
548 CHECK(ascii->state.compact == 0);
549 CHECK(ascii->state.ascii == 0);
550 CHECK(ascii->state.ready == 0);
551 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
552 CHECK(ascii->wstr != NULL);
553 CHECK(data == NULL);
554 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200555 }
556 else {
Victor Stinner68762572019-10-07 18:42:01 +0200557 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200558 || kind == PyUnicode_2BYTE_KIND
559 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200560 CHECK(ascii->state.compact == 0);
561 CHECK(ascii->state.ready == 1);
562 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200563 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200564 CHECK(compact->utf8 == data);
565 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 }
567 else
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200569 }
570 }
571 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200572 if (
573#if SIZEOF_WCHAR_T == 2
574 kind == PyUnicode_2BYTE_KIND
575#else
576 kind == PyUnicode_4BYTE_KIND
577#endif
578 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 {
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(ascii->wstr == data);
581 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200582 } else
Victor Stinner68762572019-10-07 18:42:01 +0200583 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200584 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200585
586 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200587 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200588 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200589 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200590 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200591
592 /* check that the best kind is used: O(n) operation */
593 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 Py_ssize_t i;
595 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300596 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200597 Py_UCS4 ch;
598
599 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200600 for (i=0; i < ascii->length; i++)
601 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200602 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200603 if (ch > maxchar)
604 maxchar = ch;
605 }
606 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100607 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200608 CHECK(maxchar >= 128);
609 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100610 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200611 else
Victor Stinner68762572019-10-07 18:42:01 +0200612 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200613 }
Victor Stinner77faf692011-11-20 18:56:05 +0100614 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200615 CHECK(maxchar >= 0x100);
616 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100617 }
618 else {
Victor Stinner68762572019-10-07 18:42:01 +0200619 CHECK(maxchar >= 0x10000);
620 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100621 }
Victor Stinner68762572019-10-07 18:42:01 +0200622 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200623 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400624 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200625
626#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400627}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200628
Victor Stinner910337b2011-10-03 03:20:16 +0200629
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630static PyObject*
631unicode_result_wchar(PyObject *unicode)
632{
633#ifndef Py_DEBUG
634 Py_ssize_t len;
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 len = _PyUnicode_WSTR_LENGTH(unicode);
637 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200639 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100640 }
641
642 if (len == 1) {
643 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100644 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100645 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
646 Py_DECREF(unicode);
647 return latin1_char;
648 }
649 }
650
651 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200652 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 return NULL;
654 }
655#else
Victor Stinneraa771272012-10-04 02:32:58 +0200656 assert(Py_REFCNT(unicode) == 1);
657
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100658 /* don't make the result ready in debug mode to ensure that the caller
659 makes the string ready before using it */
660 assert(_PyUnicode_CheckConsistency(unicode, 1));
661#endif
662 return unicode;
663}
664
665static PyObject*
666unicode_result_ready(PyObject *unicode)
667{
668 Py_ssize_t length;
669
670 length = PyUnicode_GET_LENGTH(unicode);
671 if (length == 0) {
672 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100673 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200674 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100675 }
676 return unicode_empty;
677 }
678
Victor Stinner607b1022020-05-05 18:50:30 +0200679#ifdef LATIN1_SINGLETONS
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100680 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300681 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200682 int kind = PyUnicode_KIND(unicode);
683 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100684 if (ch < 256) {
685 PyObject *latin1_char = unicode_latin1[ch];
686 if (latin1_char != NULL) {
687 if (unicode != latin1_char) {
688 Py_INCREF(latin1_char);
689 Py_DECREF(unicode);
690 }
691 return latin1_char;
692 }
693 else {
694 assert(_PyUnicode_CheckConsistency(unicode, 1));
695 Py_INCREF(unicode);
696 unicode_latin1[ch] = unicode;
697 return unicode;
698 }
699 }
700 }
Victor Stinner607b1022020-05-05 18:50:30 +0200701#endif
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100702
703 assert(_PyUnicode_CheckConsistency(unicode, 1));
704 return unicode;
705}
706
707static PyObject*
708unicode_result(PyObject *unicode)
709{
710 assert(_PyUnicode_CHECK(unicode));
711 if (PyUnicode_IS_READY(unicode))
712 return unicode_result_ready(unicode);
713 else
714 return unicode_result_wchar(unicode);
715}
716
Victor Stinnerc4b49542011-12-11 22:44:26 +0100717static PyObject*
718unicode_result_unchanged(PyObject *unicode)
719{
720 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500721 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722 return NULL;
723 Py_INCREF(unicode);
724 return unicode;
725 }
726 else
727 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100728 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100729}
730
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
732 ASCII, Latin1, UTF-8, etc. */
733static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200734backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
736{
Victor Stinnerad771582015-10-09 12:38:53 +0200737 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200738 Py_UCS4 ch;
739 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300740 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200741
742 assert(PyUnicode_IS_READY(unicode));
743 kind = PyUnicode_KIND(unicode);
744 data = PyUnicode_DATA(unicode);
745
746 size = 0;
747 /* determine replacement size */
748 for (i = collstart; i < collend; ++i) {
749 Py_ssize_t incr;
750
751 ch = PyUnicode_READ(kind, data, i);
752 if (ch < 0x100)
753 incr = 2+2;
754 else if (ch < 0x10000)
755 incr = 2+4;
756 else {
757 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200758 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200759 }
760 if (size > PY_SSIZE_T_MAX - incr) {
761 PyErr_SetString(PyExc_OverflowError,
762 "encoded result is too long for a Python string");
763 return NULL;
764 }
765 size += incr;
766 }
767
Victor Stinnerad771582015-10-09 12:38:53 +0200768 str = _PyBytesWriter_Prepare(writer, str, size);
769 if (str == NULL)
770 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771
772 /* generate replacement */
773 for (i = collstart; i < collend; ++i) {
774 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200775 *str++ = '\\';
776 if (ch >= 0x00010000) {
777 *str++ = 'U';
778 *str++ = Py_hexdigits[(ch>>28)&0xf];
779 *str++ = Py_hexdigits[(ch>>24)&0xf];
780 *str++ = Py_hexdigits[(ch>>20)&0xf];
781 *str++ = Py_hexdigits[(ch>>16)&0xf];
782 *str++ = Py_hexdigits[(ch>>12)&0xf];
783 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200784 }
Victor Stinner797485e2015-10-09 03:17:30 +0200785 else if (ch >= 0x100) {
786 *str++ = 'u';
787 *str++ = Py_hexdigits[(ch>>12)&0xf];
788 *str++ = Py_hexdigits[(ch>>8)&0xf];
789 }
790 else
791 *str++ = 'x';
792 *str++ = Py_hexdigits[(ch>>4)&0xf];
793 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200794 }
795 return str;
796}
797
798/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
799 ASCII, Latin1, UTF-8, etc. */
800static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200801xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200802 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
803{
Victor Stinnerad771582015-10-09 12:38:53 +0200804 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200805 Py_UCS4 ch;
806 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300807 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200808
809 assert(PyUnicode_IS_READY(unicode));
810 kind = PyUnicode_KIND(unicode);
811 data = PyUnicode_DATA(unicode);
812
813 size = 0;
814 /* determine replacement size */
815 for (i = collstart; i < collend; ++i) {
816 Py_ssize_t incr;
817
818 ch = PyUnicode_READ(kind, data, i);
819 if (ch < 10)
820 incr = 2+1+1;
821 else if (ch < 100)
822 incr = 2+2+1;
823 else if (ch < 1000)
824 incr = 2+3+1;
825 else if (ch < 10000)
826 incr = 2+4+1;
827 else if (ch < 100000)
828 incr = 2+5+1;
829 else if (ch < 1000000)
830 incr = 2+6+1;
831 else {
832 assert(ch <= MAX_UNICODE);
833 incr = 2+7+1;
834 }
835 if (size > PY_SSIZE_T_MAX - incr) {
836 PyErr_SetString(PyExc_OverflowError,
837 "encoded result is too long for a Python string");
838 return NULL;
839 }
840 size += incr;
841 }
842
Victor Stinnerad771582015-10-09 12:38:53 +0200843 str = _PyBytesWriter_Prepare(writer, str, size);
844 if (str == NULL)
845 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200846
847 /* generate replacement */
848 for (i = collstart; i < collend; ++i) {
849 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
850 }
851 return str;
852}
853
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854/* --- Bloom Filters ----------------------------------------------------- */
855
856/* stuff to implement simple "bloom filters" for Unicode characters.
857 to keep things simple, we use a single bitmask, using the least 5
858 bits from each unicode characters as the bit index. */
859
860/* the linebreak mask is set up by Unicode_Init below */
861
Antoine Pitrouf068f942010-01-13 14:19:12 +0000862#if LONG_BIT >= 128
863#define BLOOM_WIDTH 128
864#elif LONG_BIT >= 64
865#define BLOOM_WIDTH 64
866#elif LONG_BIT >= 32
867#define BLOOM_WIDTH 32
868#else
869#error "LONG_BIT is smaller than 32"
870#endif
871
Thomas Wouters477c8d52006-05-27 19:21:47 +0000872#define BLOOM_MASK unsigned long
873
Serhiy Storchaka05997252013-01-26 12:14:02 +0200874static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000875
Antoine Pitrouf068f942010-01-13 14:19:12 +0000876#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877
Benjamin Peterson29060642009-01-31 22:14:21 +0000878#define BLOOM_LINEBREAK(ch) \
879 ((ch) < 128U ? ascii_linebreak[(ch)] : \
880 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000881
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700882static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300883make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884{
Victor Stinnera85af502013-04-09 21:53:54 +0200885#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
886 do { \
887 TYPE *data = (TYPE *)PTR; \
888 TYPE *end = data + LEN; \
889 Py_UCS4 ch; \
890 for (; data != end; data++) { \
891 ch = *data; \
892 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
893 } \
894 break; \
895 } while (0)
896
Thomas Wouters477c8d52006-05-27 19:21:47 +0000897 /* calculate simple bloom-style bitmask for a given unicode string */
898
Antoine Pitrouf068f942010-01-13 14:19:12 +0000899 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000900
901 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200902 switch (kind) {
903 case PyUnicode_1BYTE_KIND:
904 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
905 break;
906 case PyUnicode_2BYTE_KIND:
907 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
908 break;
909 case PyUnicode_4BYTE_KIND:
910 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
911 break;
912 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700913 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200914 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000915 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200916
917#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000918}
919
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920static int
921ensure_unicode(PyObject *obj)
922{
923 if (!PyUnicode_Check(obj)) {
924 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200925 "must be str, not %.100s",
926 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300927 return -1;
928 }
929 return PyUnicode_READY(obj);
930}
931
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200932/* Compilation of templated routines */
933
934#include "stringlib/asciilib.h"
935#include "stringlib/fastsearch.h"
936#include "stringlib/partition.h"
937#include "stringlib/split.h"
938#include "stringlib/count.h"
939#include "stringlib/find.h"
940#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200941#include "stringlib/undef.h"
942
943#include "stringlib/ucs1lib.h"
944#include "stringlib/fastsearch.h"
945#include "stringlib/partition.h"
946#include "stringlib/split.h"
947#include "stringlib/count.h"
948#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300949#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200950#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200951#include "stringlib/undef.h"
952
953#include "stringlib/ucs2lib.h"
954#include "stringlib/fastsearch.h"
955#include "stringlib/partition.h"
956#include "stringlib/split.h"
957#include "stringlib/count.h"
958#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300959#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200960#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200961#include "stringlib/undef.h"
962
963#include "stringlib/ucs4lib.h"
964#include "stringlib/fastsearch.h"
965#include "stringlib/partition.h"
966#include "stringlib/split.h"
967#include "stringlib/count.h"
968#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300969#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200970#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200971#include "stringlib/undef.h"
972
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200973#include "stringlib/unicodedefs.h"
974#include "stringlib/fastsearch.h"
975#include "stringlib/count.h"
976#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100977#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters added by a grow, i.e. indices old_length up to the
   current length, are overwritten, every byte being set to 0xFF.  Nothing
   is done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Offsets and sizes are in bytes, hence the scaling by char_size. */
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 Ux0000 terminated; some code (e.g. new_identifier)
1213 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 if (length == 0 && unicode_empty != NULL) {
1228 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001229 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
1231
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001232 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001233 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001234 return (PyUnicodeObject *)PyErr_NoMemory();
1235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001236 if (length < 0) {
1237 PyErr_SetString(PyExc_SystemError,
1238 "Negative size passed to _PyUnicode_New");
1239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 }
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1243 if (unicode == NULL)
1244 return NULL;
1245 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001246
1247 _PyUnicode_WSTR_LENGTH(unicode) = length;
1248 _PyUnicode_HASH(unicode) = -1;
1249 _PyUnicode_STATE(unicode).interned = 0;
1250 _PyUnicode_STATE(unicode).kind = 0;
1251 _PyUnicode_STATE(unicode).compact = 0;
1252 _PyUnicode_STATE(unicode).ready = 0;
1253 _PyUnicode_STATE(unicode).ascii = 0;
1254 _PyUnicode_DATA_ANY(unicode) = NULL;
1255 _PyUnicode_LENGTH(unicode) = 0;
1256 _PyUnicode_UTF8(unicode) = NULL;
1257 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1260 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001261 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001262 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001263 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001265
Jeremy Hyltond8082792003-09-16 19:41:39 +00001266 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001267 * the caller fails before initializing str -- unicode_resize()
1268 * reads str[0], and the Keep-Alive optimization can keep memory
1269 * allocated for str alive across a call to unicode_dealloc(unicode).
1270 * We don't want unicode_resize to read uninitialized memory in
1271 * that case.
1272 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 _PyUnicode_WSTR(unicode)[0] = 0;
1274 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001275
Victor Stinner7931d9a2011-11-04 00:22:48 +01001276 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 return unicode;
1278}
1279
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280static const char*
1281unicode_kind_name(PyObject *unicode)
1282{
Victor Stinner42dfd712011-10-03 14:41:45 +02001283 /* don't check consistency: unicode_kind_name() is called from
1284 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001285 if (!PyUnicode_IS_COMPACT(unicode))
1286 {
1287 if (!PyUnicode_IS_READY(unicode))
1288 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001289 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001290 {
1291 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001292 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001293 return "legacy ascii";
1294 else
1295 return "legacy latin1";
1296 case PyUnicode_2BYTE_KIND:
1297 return "legacy UCS2";
1298 case PyUnicode_4BYTE_KIND:
1299 return "legacy UCS4";
1300 default:
1301 return "<legacy invalid kind>";
1302 }
1303 }
1304 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001305 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001306 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001307 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001308 return "ascii";
1309 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001310 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001311 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001312 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001313 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001314 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001315 default:
1316 return "<invalid compact kind>";
1317 }
1318}
1319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001322const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001323 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001324 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325}
1326
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001327const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001328 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329 return _PyUnicode_COMPACT_DATA(unicode);
1330}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001331const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001332 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001333 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1335 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1336 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1337 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1338 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1339 return PyUnicode_DATA(unicode);
1340}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001341
1342void
1343_PyUnicode_Dump(PyObject *op)
1344{
1345 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001346 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1347 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001348 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001349
Victor Stinnera849a4b2011-10-03 12:12:11 +02001350 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001351 {
1352 if (ascii->state.ascii)
1353 data = (ascii + 1);
1354 else
1355 data = (compact + 1);
1356 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001357 else
1358 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001359 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001360
Victor Stinnera849a4b2011-10-03 12:12:11 +02001361 if (ascii->wstr == data)
1362 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001363 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001364
Victor Stinnera3b334d2011-10-03 13:53:37 +02001365 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001366 printf(" (%zu), ", compact->wstr_length);
1367 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001368 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001369 }
1370 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001371 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001372 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001373}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374#endif
1375
1376PyObject *
1377PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1378{
1379 PyObject *obj;
1380 PyCompactUnicodeObject *unicode;
1381 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001382 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001383 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 Py_ssize_t char_size;
1385 Py_ssize_t struct_size;
1386
1387 /* Optimization for empty strings */
1388 if (size == 0 && unicode_empty != NULL) {
1389 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001390 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 }
1392
Victor Stinner9e9d6892011-10-04 01:02:02 +02001393 is_ascii = 0;
1394 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 struct_size = sizeof(PyCompactUnicodeObject);
1396 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001397 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398 char_size = 1;
1399 is_ascii = 1;
1400 struct_size = sizeof(PyASCIIObject);
1401 }
1402 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001403 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 char_size = 1;
1405 }
1406 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001407 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 char_size = 2;
1409 if (sizeof(wchar_t) == 2)
1410 is_sharing = 1;
1411 }
1412 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001413 if (maxchar > MAX_UNICODE) {
1414 PyErr_SetString(PyExc_SystemError,
1415 "invalid maximum character passed to PyUnicode_New");
1416 return NULL;
1417 }
Victor Stinner8f825062012-04-27 13:55:39 +02001418 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 char_size = 4;
1420 if (sizeof(wchar_t) == 4)
1421 is_sharing = 1;
1422 }
1423
1424 /* Ensure we won't overflow the size. */
1425 if (size < 0) {
1426 PyErr_SetString(PyExc_SystemError,
1427 "Negative size passed to PyUnicode_New");
1428 return NULL;
1429 }
1430 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1431 return PyErr_NoMemory();
1432
1433 /* Duplicated allocation code from _PyObject_New() instead of a call to
1434 * PyObject_New() so we are able to allocate space for the object and
1435 * it's data buffer.
1436 */
1437 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001438 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001440 }
1441 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442
1443 unicode = (PyCompactUnicodeObject *)obj;
1444 if (is_ascii)
1445 data = ((PyASCIIObject*)obj) + 1;
1446 else
1447 data = unicode + 1;
1448 _PyUnicode_LENGTH(unicode) = size;
1449 _PyUnicode_HASH(unicode) = -1;
1450 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001451 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 _PyUnicode_STATE(unicode).compact = 1;
1453 _PyUnicode_STATE(unicode).ready = 1;
1454 _PyUnicode_STATE(unicode).ascii = is_ascii;
1455 if (is_ascii) {
1456 ((char*)data)[size] = 0;
1457 _PyUnicode_WSTR(unicode) = NULL;
1458 }
Victor Stinner8f825062012-04-27 13:55:39 +02001459 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 ((char*)data)[size] = 0;
1461 _PyUnicode_WSTR(unicode) = NULL;
1462 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001464 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 else {
1467 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001468 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001469 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001471 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 ((Py_UCS4*)data)[size] = 0;
1473 if (is_sharing) {
1474 _PyUnicode_WSTR_LENGTH(unicode) = size;
1475 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1476 }
1477 else {
1478 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 }
1481 }
Victor Stinner8f825062012-04-27 13:55:39 +02001482#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001483 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001484#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001485 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 return obj;
1487}
1488
1489#if SIZEOF_WCHAR_T == 2
1490/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1491 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001492 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493
1494 This function assumes that unicode can hold one more code point than wstr
1495 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001496static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001498 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499{
1500 const wchar_t *iter;
1501 Py_UCS4 *ucs4_out;
1502
Victor Stinner910337b2011-10-03 03:20:16 +02001503 assert(unicode != NULL);
1504 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1506 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1507
1508 for (iter = begin; iter < end; ) {
1509 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1510 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001511 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1512 && (iter+1) < end
1513 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514 {
Victor Stinner551ac952011-11-29 22:58:13 +01001515 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 iter += 2;
1517 }
1518 else {
1519 *ucs4_out++ = *iter;
1520 iter++;
1521 }
1522 }
1523 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1524 _PyUnicode_GET_LENGTH(unicode)));
1525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526}
1527#endif
1528
Victor Stinnercd9950f2011-10-02 00:34:53 +02001529static int
Victor Stinner488fa492011-12-12 00:01:39 +01001530unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001531{
Victor Stinner488fa492011-12-12 00:01:39 +01001532 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001533 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001534 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001535 return -1;
1536 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001537 return 0;
1538}
1539
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001540static int
1541_copy_characters(PyObject *to, Py_ssize_t to_start,
1542 PyObject *from, Py_ssize_t from_start,
1543 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001545 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001546 const void *from_data;
1547 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548
Victor Stinneree4544c2012-05-09 22:24:08 +02001549 assert(0 <= how_many);
1550 assert(0 <= from_start);
1551 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001552 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001553 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001554 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555
Victor Stinnerd3f08822012-05-29 12:57:52 +02001556 assert(PyUnicode_Check(to));
1557 assert(PyUnicode_IS_READY(to));
1558 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1559
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001560 if (how_many == 0)
1561 return 0;
1562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001564 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001566 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567
Victor Stinnerf1852262012-06-16 16:38:26 +02001568#ifdef Py_DEBUG
1569 if (!check_maxchar
1570 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1571 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001572 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001573 Py_UCS4 ch;
1574 Py_ssize_t i;
1575 for (i=0; i < how_many; i++) {
1576 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1577 assert(ch <= to_maxchar);
1578 }
1579 }
1580#endif
1581
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001582 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001583 if (check_maxchar
1584 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1585 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001586 /* Writing Latin-1 characters into an ASCII string requires to
1587 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001588 Py_UCS4 max_char;
1589 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001590 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001591 if (max_char >= 128)
1592 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001593 }
Christian Heimesf051e432016-09-13 20:22:02 +02001594 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001595 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001596 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001598 else if (from_kind == PyUnicode_1BYTE_KIND
1599 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001600 {
1601 _PyUnicode_CONVERT_BYTES(
1602 Py_UCS1, Py_UCS2,
1603 PyUnicode_1BYTE_DATA(from) + from_start,
1604 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1605 PyUnicode_2BYTE_DATA(to) + to_start
1606 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001607 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001608 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001609 && to_kind == PyUnicode_4BYTE_KIND)
1610 {
1611 _PyUnicode_CONVERT_BYTES(
1612 Py_UCS1, Py_UCS4,
1613 PyUnicode_1BYTE_DATA(from) + from_start,
1614 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1615 PyUnicode_4BYTE_DATA(to) + to_start
1616 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001617 }
1618 else if (from_kind == PyUnicode_2BYTE_KIND
1619 && to_kind == PyUnicode_4BYTE_KIND)
1620 {
1621 _PyUnicode_CONVERT_BYTES(
1622 Py_UCS2, Py_UCS4,
1623 PyUnicode_2BYTE_DATA(from) + from_start,
1624 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1625 PyUnicode_4BYTE_DATA(to) + to_start
1626 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001628 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001629 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1630
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001631 if (!check_maxchar) {
1632 if (from_kind == PyUnicode_2BYTE_KIND
1633 && to_kind == PyUnicode_1BYTE_KIND)
1634 {
1635 _PyUnicode_CONVERT_BYTES(
1636 Py_UCS2, Py_UCS1,
1637 PyUnicode_2BYTE_DATA(from) + from_start,
1638 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1639 PyUnicode_1BYTE_DATA(to) + to_start
1640 );
1641 }
1642 else if (from_kind == PyUnicode_4BYTE_KIND
1643 && to_kind == PyUnicode_1BYTE_KIND)
1644 {
1645 _PyUnicode_CONVERT_BYTES(
1646 Py_UCS4, Py_UCS1,
1647 PyUnicode_4BYTE_DATA(from) + from_start,
1648 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1649 PyUnicode_1BYTE_DATA(to) + to_start
1650 );
1651 }
1652 else if (from_kind == PyUnicode_4BYTE_KIND
1653 && to_kind == PyUnicode_2BYTE_KIND)
1654 {
1655 _PyUnicode_CONVERT_BYTES(
1656 Py_UCS4, Py_UCS2,
1657 PyUnicode_4BYTE_DATA(from) + from_start,
1658 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1659 PyUnicode_2BYTE_DATA(to) + to_start
1660 );
1661 }
1662 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001663 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001664 }
1665 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001666 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001667 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001668 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001669 Py_ssize_t i;
1670
Victor Stinnera0702ab2011-09-29 14:14:38 +02001671 for (i=0; i < how_many; i++) {
1672 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001673 if (ch > to_maxchar)
1674 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001675 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1676 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001677 }
1678 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001679 return 0;
1680}
1681
Victor Stinnerd3f08822012-05-29 12:57:52 +02001682void
1683_PyUnicode_FastCopyCharacters(
1684 PyObject *to, Py_ssize_t to_start,
1685 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686{
1687 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1688}
1689
1690Py_ssize_t
1691PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1692 PyObject *from, Py_ssize_t from_start,
1693 Py_ssize_t how_many)
1694{
1695 int err;
1696
1697 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1698 PyErr_BadInternalCall();
1699 return -1;
1700 }
1701
Benjamin Petersonbac79492012-01-14 13:34:47 -05001702 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001703 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001704 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001705 return -1;
1706
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001707 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001708 PyErr_SetString(PyExc_IndexError, "string index out of range");
1709 return -1;
1710 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001711 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001712 PyErr_SetString(PyExc_IndexError, "string index out of range");
1713 return -1;
1714 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001715 if (how_many < 0) {
1716 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1717 return -1;
1718 }
1719 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001720 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1721 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001722 "Cannot write %zi characters at %zi "
1723 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001724 how_many, to_start, PyUnicode_GET_LENGTH(to));
1725 return -1;
1726 }
1727
1728 if (how_many == 0)
1729 return 0;
1730
Victor Stinner488fa492011-12-12 00:01:39 +01001731 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001732 return -1;
1733
1734 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1735 if (err) {
1736 PyErr_Format(PyExc_SystemError,
1737 "Cannot copy %s characters "
1738 "into a string of %s characters",
1739 unicode_kind_name(from),
1740 unicode_kind_name(to));
1741 return -1;
1742 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001743 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744}
1745
Victor Stinner17222162011-09-28 22:15:37 +02001746/* Find the maximum code point and count the number of surrogate pairs so a
1747 correct string length can be computed before converting a string to UCS4.
1748 This function counts single surrogates as a character and not as a pair.
1749
1750 Return 0 on success, or -1 on error. */
1751static int
1752find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1753 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754{
1755 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001756 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757
Victor Stinnerc53be962011-10-02 21:33:54 +02001758 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 *num_surrogates = 0;
1760 *maxchar = 0;
1761
1762 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001764 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1765 && (iter+1) < end
1766 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1767 {
1768 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1769 ++(*num_surrogates);
1770 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 }
1772 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001774 {
1775 ch = *iter;
1776 iter++;
1777 }
1778 if (ch > *maxchar) {
1779 *maxchar = ch;
1780 if (*maxchar > MAX_UNICODE) {
1781 PyErr_Format(PyExc_ValueError,
1782 "character U+%x is not in range [U+0000; U+10ffff]",
1783 ch);
1784 return -1;
1785 }
1786 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 }
1788 return 0;
1789}
1790
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001791int
1792_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793{
1794 wchar_t *end;
1795 Py_UCS4 maxchar = 0;
1796 Py_ssize_t num_surrogates;
1797#if SIZEOF_WCHAR_T == 2
1798 Py_ssize_t length_wo_surrogates;
1799#endif
1800
Georg Brandl7597add2011-10-05 16:36:47 +02001801 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001802 strings were created using _PyObject_New() and where no canonical
1803 representation (the str field) has been set yet aka strings
1804 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001805 assert(_PyUnicode_CHECK(unicode));
1806 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001808 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001809 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001810 /* Actually, it should neither be interned nor be anything else: */
1811 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001814 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001815 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817
1818 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001819 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1820 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 PyErr_NoMemory();
1822 return -1;
1823 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001824 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 _PyUnicode_WSTR(unicode), end,
1826 PyUnicode_1BYTE_DATA(unicode));
1827 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1828 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1829 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1830 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001831 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001832 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001833 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 }
1835 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001836 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001837 _PyUnicode_UTF8(unicode) = NULL;
1838 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 }
1840 PyObject_FREE(_PyUnicode_WSTR(unicode));
1841 _PyUnicode_WSTR(unicode) = NULL;
1842 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1843 }
1844 /* In this case we might have to convert down from 4-byte native
1845 wchar_t to 2-byte unicode. */
1846 else if (maxchar < 65536) {
1847 assert(num_surrogates == 0 &&
1848 "FindMaxCharAndNumSurrogatePairs() messed up");
1849
Victor Stinner506f5922011-09-28 22:34:18 +02001850#if SIZEOF_WCHAR_T == 2
1851 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001852 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001853 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1854 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1855 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001856 _PyUnicode_UTF8(unicode) = NULL;
1857 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001858#else
1859 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001860 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001861 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001862 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001863 PyErr_NoMemory();
1864 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001865 }
Victor Stinner506f5922011-09-28 22:34:18 +02001866 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1867 _PyUnicode_WSTR(unicode), end,
1868 PyUnicode_2BYTE_DATA(unicode));
1869 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1870 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1871 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001872 _PyUnicode_UTF8(unicode) = NULL;
1873 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001874 PyObject_FREE(_PyUnicode_WSTR(unicode));
1875 _PyUnicode_WSTR(unicode) = NULL;
1876 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1877#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878 }
1879 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1880 else {
1881#if SIZEOF_WCHAR_T == 2
1882 /* in case the native representation is 2-bytes, we need to allocate a
1883 new normalized 4-byte version. */
1884 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001885 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1886 PyErr_NoMemory();
1887 return -1;
1888 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001889 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1890 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 PyErr_NoMemory();
1892 return -1;
1893 }
1894 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1895 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001896 _PyUnicode_UTF8(unicode) = NULL;
1897 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001898 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1899 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001900 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 PyObject_FREE(_PyUnicode_WSTR(unicode));
1902 _PyUnicode_WSTR(unicode) = NULL;
1903 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1904#else
1905 assert(num_surrogates == 0);
1906
Victor Stinnerc3c74152011-10-02 20:39:55 +02001907 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001909 _PyUnicode_UTF8(unicode) = NULL;
1910 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1912#endif
1913 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1914 }
1915 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001916 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001917 return 0;
1918}
1919
Alexander Belopolsky40018472011-02-26 01:02:56 +00001920static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001921unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922{
Walter Dörwald16807132007-05-25 13:52:07 +00001923 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001924 case SSTATE_NOT_INTERNED:
1925 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001926
Benjamin Peterson29060642009-01-31 22:14:21 +00001927 case SSTATE_INTERNED_MORTAL:
1928 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001929 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001930#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001931 if (PyDict_DelItem(interned, unicode) != 0) {
1932 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1933 NULL);
1934 }
Victor Stinner607b1022020-05-05 18:50:30 +02001935#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001936 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001937
Benjamin Peterson29060642009-01-31 22:14:21 +00001938 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001939 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1940 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001941
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001943 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001944 }
1945
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001946 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001948 }
1949 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001950 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001951 }
1952 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001953 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001956 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957}
1958
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001959#ifdef Py_DEBUG
1960static int
1961unicode_is_singleton(PyObject *unicode)
1962{
Victor Stinner607b1022020-05-05 18:50:30 +02001963 if (unicode == unicode_empty) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001964 return 1;
Victor Stinner607b1022020-05-05 18:50:30 +02001965 }
1966#ifdef LATIN1_SINGLETONS
1967 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001968 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1969 {
1970 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1971 if (ch < 256 && unicode_latin1[ch] == unicode)
1972 return 1;
1973 }
Victor Stinner607b1022020-05-05 18:50:30 +02001974#endif
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001975 return 0;
1976}
1977#endif
1978
Alexander Belopolsky40018472011-02-26 01:02:56 +00001979static int
Victor Stinner488fa492011-12-12 00:01:39 +01001980unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001981{
Victor Stinner488fa492011-12-12 00:01:39 +01001982 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001983 if (Py_REFCNT(unicode) != 1)
1984 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001985 if (_PyUnicode_HASH(unicode) != -1)
1986 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001987 if (PyUnicode_CHECK_INTERNED(unicode))
1988 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001989 if (!PyUnicode_CheckExact(unicode))
1990 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001991#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001992 /* singleton refcount is greater than 1 */
1993 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001994#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001995 return 1;
1996}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001997
Victor Stinnerfe226c02011-10-03 03:52:20 +02001998static int
1999unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2000{
2001 PyObject *unicode;
2002 Py_ssize_t old_length;
2003
2004 assert(p_unicode != NULL);
2005 unicode = *p_unicode;
2006
2007 assert(unicode != NULL);
2008 assert(PyUnicode_Check(unicode));
2009 assert(0 <= length);
2010
Victor Stinner910337b2011-10-03 03:20:16 +02002011 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002012 old_length = PyUnicode_WSTR_LENGTH(unicode);
2013 else
2014 old_length = PyUnicode_GET_LENGTH(unicode);
2015 if (old_length == length)
2016 return 0;
2017
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002018 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002019 _Py_INCREF_UNICODE_EMPTY();
2020 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002021 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002022 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002023 return 0;
2024 }
2025
Victor Stinner488fa492011-12-12 00:01:39 +01002026 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002027 PyObject *copy = resize_copy(unicode, length);
2028 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002029 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002030 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002032 }
2033
Victor Stinnerfe226c02011-10-03 03:52:20 +02002034 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002035 PyObject *new_unicode = resize_compact(unicode, length);
2036 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002037 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002038 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002039 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002040 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002041 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002042}
2043
Alexander Belopolsky40018472011-02-26 01:02:56 +00002044int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002045PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002046{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002047 PyObject *unicode;
2048 if (p_unicode == NULL) {
2049 PyErr_BadInternalCall();
2050 return -1;
2051 }
2052 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002053 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002054 {
2055 PyErr_BadInternalCall();
2056 return -1;
2057 }
2058 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002059}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002060
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002061/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002062
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002063 WARNING: The function doesn't copy the terminating null character and
2064 doesn't check the maximum character (may write a latin1 character in an
2065 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002066static void
2067unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2068 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002069{
2070 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002071 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002072 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002073
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002074 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002075 switch (kind) {
2076 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002077#ifdef Py_DEBUG
2078 if (PyUnicode_IS_ASCII(unicode)) {
2079 Py_UCS4 maxchar = ucs1lib_find_max_char(
2080 (const Py_UCS1*)str,
2081 (const Py_UCS1*)str + len);
2082 assert(maxchar < 128);
2083 }
2084#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002085 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002086 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002087 }
2088 case PyUnicode_2BYTE_KIND: {
2089 Py_UCS2 *start = (Py_UCS2 *)data + index;
2090 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002091
Victor Stinner184252a2012-06-16 02:57:41 +02002092 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002093 *ucs2 = (Py_UCS2)*str;
2094
2095 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002096 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002097 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002098 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002099 Py_UCS4 *start = (Py_UCS4 *)data + index;
2100 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002101
Victor Stinner184252a2012-06-16 02:57:41 +02002102 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002103 *ucs4 = (Py_UCS4)*str;
2104
2105 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002106 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002107 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002108 default:
2109 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002110 }
2111}
2112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113static PyObject*
2114get_latin1_char(unsigned char ch)
2115{
Victor Stinner607b1022020-05-05 18:50:30 +02002116 PyObject *unicode;
2117
2118#ifdef LATIN1_SINGLETONS
2119 unicode = unicode_latin1[ch];
2120 if (unicode) {
2121 Py_INCREF(unicode);
2122 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 }
Victor Stinner607b1022020-05-05 18:50:30 +02002124#endif
2125
2126 unicode = PyUnicode_New(1, ch);
2127 if (!unicode) {
2128 return NULL;
2129 }
2130
2131 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2132 assert(_PyUnicode_CheckConsistency(unicode, 1));
2133
2134#ifdef LATIN1_SINGLETONS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135 Py_INCREF(unicode);
Victor Stinner607b1022020-05-05 18:50:30 +02002136 unicode_latin1[ch] = unicode;
2137#endif
Victor Stinnera464fc12011-10-02 20:39:30 +02002138 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139}
2140
Victor Stinner985a82a2014-01-03 12:53:47 +01002141static PyObject*
2142unicode_char(Py_UCS4 ch)
2143{
2144 PyObject *unicode;
2145
2146 assert(ch <= MAX_UNICODE);
2147
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002148 if (ch < 256)
2149 return get_latin1_char(ch);
2150
Victor Stinner985a82a2014-01-03 12:53:47 +01002151 unicode = PyUnicode_New(1, ch);
2152 if (unicode == NULL)
2153 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002154
2155 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2156 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002157 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002158 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002159 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2160 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2161 }
2162 assert(_PyUnicode_CheckConsistency(unicode, 1));
2163 return unicode;
2164}
2165
Alexander Belopolsky40018472011-02-26 01:02:56 +00002166PyObject *
2167PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002169 if (u == NULL)
2170 return (PyObject*)_PyUnicode_New(size);
2171
2172 if (size < 0) {
2173 PyErr_BadInternalCall();
2174 return NULL;
2175 }
2176
2177 return PyUnicode_FromWideChar(u, size);
2178}
2179
2180PyObject *
2181PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2182{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002183 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 Py_UCS4 maxchar = 0;
2185 Py_ssize_t num_surrogates;
2186
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002187 if (u == NULL && size != 0) {
2188 PyErr_BadInternalCall();
2189 return NULL;
2190 }
2191
2192 if (size == -1) {
2193 size = wcslen(u);
2194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002196 /* If the Unicode data is known at construction time, we can apply
2197 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002200 if (size == 0)
2201 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 /* Single character Unicode objects in the Latin-1 range are
2204 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002205 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 return get_latin1_char((unsigned char)*u);
2207
2208 /* If not empty and not single character, copy the Unicode data
2209 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002210 if (find_maxchar_surrogates(u, u + size,
2211 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 return NULL;
2213
Victor Stinner8faf8212011-12-08 22:14:11 +01002214 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 if (!unicode)
2216 return NULL;
2217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 switch (PyUnicode_KIND(unicode)) {
2219 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002220 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2222 break;
2223 case PyUnicode_2BYTE_KIND:
2224#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002225 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002227 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2229#endif
2230 break;
2231 case PyUnicode_4BYTE_KIND:
2232#if SIZEOF_WCHAR_T == 2
2233 /* This is the only case which has to process surrogates, thus
2234 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002235 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236#else
2237 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002238 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239#endif
2240 break;
2241 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002242 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002245 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246}
2247
Alexander Belopolsky40018472011-02-26 01:02:56 +00002248PyObject *
2249PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002250{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002251 if (size < 0) {
2252 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002253 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002254 return NULL;
2255 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002256 if (u != NULL)
2257 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2258 else
2259 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002260}
2261
Alexander Belopolsky40018472011-02-26 01:02:56 +00002262PyObject *
2263PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002264{
2265 size_t size = strlen(u);
2266 if (size > PY_SSIZE_T_MAX) {
2267 PyErr_SetString(PyExc_OverflowError, "input too long");
2268 return NULL;
2269 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002270 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002271}
2272
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002273PyObject *
2274_PyUnicode_FromId(_Py_Identifier *id)
2275{
Victor Stinner297257f2020-06-02 14:39:45 +02002276 if (id->object) {
2277 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002278 }
Victor Stinner297257f2020-06-02 14:39:45 +02002279
2280 PyObject *obj;
2281 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2282 strlen(id->string),
2283 NULL, NULL);
2284 if (!obj) {
2285 return NULL;
2286 }
2287 PyUnicode_InternInPlace(&obj);
2288
2289 assert(!id->next);
2290 id->object = obj;
2291 id->next = static_strings;
2292 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002293 return id->object;
2294}
2295
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002296static void
2297unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002298{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002299 _Py_Identifier *tmp, *s = static_strings;
2300 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002301 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002302 tmp = s->next;
2303 s->next = NULL;
2304 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002305 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002306 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002307}
2308
Benjamin Peterson0df54292012-03-26 14:50:32 -04002309/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002310
Victor Stinnerd3f08822012-05-29 12:57:52 +02002311PyObject*
2312_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002313{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002314 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002315 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002316 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002317#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002318 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002319#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002320 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002321 }
Victor Stinner785938e2011-12-11 20:09:03 +01002322 unicode = PyUnicode_New(size, 127);
2323 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002324 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002325 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2326 assert(_PyUnicode_CheckConsistency(unicode, 1));
2327 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002328}
2329
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002330static Py_UCS4
2331kind_maxchar_limit(unsigned int kind)
2332{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002333 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002334 case PyUnicode_1BYTE_KIND:
2335 return 0x80;
2336 case PyUnicode_2BYTE_KIND:
2337 return 0x100;
2338 case PyUnicode_4BYTE_KIND:
2339 return 0x10000;
2340 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002341 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002342 }
2343}
2344
Victor Stinner702c7342011-10-05 13:50:52 +02002345static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002346_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002349 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002350
Serhiy Storchaka678db842013-01-26 12:16:36 +02002351 if (size == 0)
2352 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002353 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002354 if (size == 1)
2355 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002356
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002358 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 if (!res)
2360 return NULL;
2361 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002362 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002363 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002364}
2365
Victor Stinnere57b1c02011-09-28 22:20:48 +02002366static PyObject*
2367_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368{
2369 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002370 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002371
Serhiy Storchaka678db842013-01-26 12:16:36 +02002372 if (size == 0)
2373 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002374 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002375 if (size == 1)
2376 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002377
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002378 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002379 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 if (!res)
2381 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002382 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002384 else {
2385 _PyUnicode_CONVERT_BYTES(
2386 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2387 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002388 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 return res;
2390}
2391
Victor Stinnere57b1c02011-09-28 22:20:48 +02002392static PyObject*
2393_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394{
2395 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002396 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397
Serhiy Storchaka678db842013-01-26 12:16:36 +02002398 if (size == 0)
2399 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002400 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002401 if (size == 1)
2402 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002403
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002404 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002405 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 if (!res)
2407 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002408 if (max_char < 256)
2409 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2410 PyUnicode_1BYTE_DATA(res));
2411 else if (max_char < 0x10000)
2412 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2413 PyUnicode_2BYTE_DATA(res));
2414 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002416 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 return res;
2418}
2419
2420PyObject*
2421PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2422{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002423 if (size < 0) {
2424 PyErr_SetString(PyExc_ValueError, "size must be positive");
2425 return NULL;
2426 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002427 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002429 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002431 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002433 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002434 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002435 PyErr_SetString(PyExc_SystemError, "invalid kind");
2436 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438}
2439
Victor Stinnerece58de2012-04-23 23:36:38 +02002440Py_UCS4
2441_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2442{
2443 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002444 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002445
2446 assert(PyUnicode_IS_READY(unicode));
2447 assert(0 <= start);
2448 assert(end <= PyUnicode_GET_LENGTH(unicode));
2449 assert(start <= end);
2450
2451 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2452 return PyUnicode_MAX_CHAR_VALUE(unicode);
2453
2454 if (start == end)
2455 return 127;
2456
Victor Stinner94d558b2012-04-27 22:26:58 +02002457 if (PyUnicode_IS_ASCII(unicode))
2458 return 127;
2459
Victor Stinnerece58de2012-04-23 23:36:38 +02002460 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002461 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002462 endptr = (char *)startptr + end * kind;
2463 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002464 switch(kind) {
2465 case PyUnicode_1BYTE_KIND:
2466 return ucs1lib_find_max_char(startptr, endptr);
2467 case PyUnicode_2BYTE_KIND:
2468 return ucs2lib_find_max_char(startptr, endptr);
2469 case PyUnicode_4BYTE_KIND:
2470 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002471 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002472 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002473 }
2474}
2475
Victor Stinner25a4b292011-10-06 12:31:55 +02002476/* Ensure that a string uses the most efficient storage, if it is not the
2477 case: create a new string with of the right kind. Write NULL into *p_unicode
2478 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002479static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002480unicode_adjust_maxchar(PyObject **p_unicode)
2481{
2482 PyObject *unicode, *copy;
2483 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002484 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002485 unsigned int kind;
2486
2487 assert(p_unicode != NULL);
2488 unicode = *p_unicode;
2489 assert(PyUnicode_IS_READY(unicode));
2490 if (PyUnicode_IS_ASCII(unicode))
2491 return;
2492
2493 len = PyUnicode_GET_LENGTH(unicode);
2494 kind = PyUnicode_KIND(unicode);
2495 if (kind == PyUnicode_1BYTE_KIND) {
2496 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002497 max_char = ucs1lib_find_max_char(u, u + len);
2498 if (max_char >= 128)
2499 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002500 }
2501 else if (kind == PyUnicode_2BYTE_KIND) {
2502 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002503 max_char = ucs2lib_find_max_char(u, u + len);
2504 if (max_char >= 256)
2505 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002506 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002507 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002508 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002509 max_char = ucs4lib_find_max_char(u, u + len);
2510 if (max_char >= 0x10000)
2511 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002512 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002513 else
2514 Py_UNREACHABLE();
2515
Victor Stinner25a4b292011-10-06 12:31:55 +02002516 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002517 if (copy != NULL)
2518 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002519 Py_DECREF(unicode);
2520 *p_unicode = copy;
2521}
2522
Victor Stinner034f6cf2011-09-30 02:26:44 +02002523PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002524_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002525{
Victor Stinner87af4f22011-11-21 23:03:47 +01002526 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002527 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002528
Victor Stinner034f6cf2011-09-30 02:26:44 +02002529 if (!PyUnicode_Check(unicode)) {
2530 PyErr_BadInternalCall();
2531 return NULL;
2532 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002533 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002534 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002535
Victor Stinner87af4f22011-11-21 23:03:47 +01002536 length = PyUnicode_GET_LENGTH(unicode);
2537 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002538 if (!copy)
2539 return NULL;
2540 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2541
Christian Heimesf051e432016-09-13 20:22:02 +02002542 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002543 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002544 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002545 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002546}
2547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548
Victor Stinnerbc603d12011-10-02 01:00:40 +02002549/* Widen Unicode objects to larger buffers. Don't write terminating null
2550 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002552static void*
2553unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002555 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002556
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002557 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002558 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002559 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002560 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002561 if (!result)
2562 return PyErr_NoMemory();
2563 assert(skind == PyUnicode_1BYTE_KIND);
2564 _PyUnicode_CONVERT_BYTES(
2565 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002566 (const Py_UCS1 *)data,
2567 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002568 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002570 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002571 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002572 if (!result)
2573 return PyErr_NoMemory();
2574 if (skind == PyUnicode_2BYTE_KIND) {
2575 _PyUnicode_CONVERT_BYTES(
2576 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002577 (const Py_UCS2 *)data,
2578 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002579 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002581 else {
2582 assert(skind == PyUnicode_1BYTE_KIND);
2583 _PyUnicode_CONVERT_BYTES(
2584 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002585 (const Py_UCS1 *)data,
2586 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002587 result);
2588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002590 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002591 Py_UNREACHABLE();
2592 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594}
2595
2596static Py_UCS4*
2597as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2598 int copy_null)
2599{
2600 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002601 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 Py_ssize_t len, targetlen;
2603 if (PyUnicode_READY(string) == -1)
2604 return NULL;
2605 kind = PyUnicode_KIND(string);
2606 data = PyUnicode_DATA(string);
2607 len = PyUnicode_GET_LENGTH(string);
2608 targetlen = len;
2609 if (copy_null)
2610 targetlen++;
2611 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002612 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 if (!target) {
2614 PyErr_NoMemory();
2615 return NULL;
2616 }
2617 }
2618 else {
2619 if (targetsize < targetlen) {
2620 PyErr_Format(PyExc_SystemError,
2621 "string is longer than the buffer");
2622 if (copy_null && 0 < targetsize)
2623 target[0] = 0;
2624 return NULL;
2625 }
2626 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002627 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002628 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002629 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002631 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002632 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002633 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2634 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002635 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002636 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002637 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002638 else {
2639 Py_UNREACHABLE();
2640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 if (copy_null)
2642 target[len] = 0;
2643 return target;
2644}
2645
2646Py_UCS4*
2647PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2648 int copy_null)
2649{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002650 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 PyErr_BadInternalCall();
2652 return NULL;
2653 }
2654 return as_ucs4(string, target, targetsize, copy_null);
2655}
2656
2657Py_UCS4*
2658PyUnicode_AsUCS4Copy(PyObject *string)
2659{
2660 return as_ucs4(string, NULL, 0, 1);
2661}
2662
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for the
   sign.  53/22 is an upper bound for log10(256), and
   1 + (SIZEOF_LONG_LONG*53-1) / 22 is that ceiling computed in integer
   arithmetic; the remaining 1 in the macro below is the sign. */
2666#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002667
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002668static int
2669unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2670 Py_ssize_t width, Py_ssize_t precision)
2671{
2672 Py_ssize_t length, fill, arglen;
2673 Py_UCS4 maxchar;
2674
2675 if (PyUnicode_READY(str) == -1)
2676 return -1;
2677
2678 length = PyUnicode_GET_LENGTH(str);
2679 if ((precision == -1 || precision >= length)
2680 && width <= length)
2681 return _PyUnicodeWriter_WriteStr(writer, str);
2682
2683 if (precision != -1)
2684 length = Py_MIN(precision, length);
2685
2686 arglen = Py_MAX(length, width);
2687 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2688 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2689 else
2690 maxchar = writer->maxchar;
2691
2692 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2693 return -1;
2694
2695 if (width > length) {
2696 fill = width - length;
2697 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2698 return -1;
2699 writer->pos += fill;
2700 }
2701
2702 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2703 str, 0, length);
2704 writer->pos += length;
2705 return 0;
2706}
2707
2708static int
Victor Stinner998b8062018-09-12 00:23:25 +02002709unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002710 Py_ssize_t width, Py_ssize_t precision)
2711{
2712 /* UTF-8 */
2713 Py_ssize_t length;
2714 PyObject *unicode;
2715 int res;
2716
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002717 if (precision == -1) {
2718 length = strlen(str);
2719 }
2720 else {
2721 length = 0;
2722 while (length < precision && str[length]) {
2723 length++;
2724 }
2725 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2727 if (unicode == NULL)
2728 return -1;
2729
2730 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2731 Py_DECREF(unicode);
2732 return res;
2733}
2734
Victor Stinner96865452011-03-01 23:44:09 +00002735static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002736unicode_fromformat_arg(_PyUnicodeWriter *writer,
2737 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002738{
Victor Stinnere215d962012-10-06 23:03:36 +02002739 const char *p;
2740 Py_ssize_t len;
2741 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002742 Py_ssize_t width;
2743 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002744 int longflag;
2745 int longlongflag;
2746 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002748
2749 p = f;
2750 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002751 zeropad = 0;
2752 if (*f == '0') {
2753 zeropad = 1;
2754 f++;
2755 }
Victor Stinner96865452011-03-01 23:44:09 +00002756
2757 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002758 width = -1;
2759 if (Py_ISDIGIT((unsigned)*f)) {
2760 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002761 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002762 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002763 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002764 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002766 return NULL;
2767 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002768 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002769 f++;
2770 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002771 }
2772 precision = -1;
2773 if (*f == '.') {
2774 f++;
2775 if (Py_ISDIGIT((unsigned)*f)) {
2776 precision = (*f - '0');
2777 f++;
2778 while (Py_ISDIGIT((unsigned)*f)) {
2779 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2780 PyErr_SetString(PyExc_ValueError,
2781 "precision too big");
2782 return NULL;
2783 }
2784 precision = (precision * 10) + (*f - '0');
2785 f++;
2786 }
2787 }
Victor Stinner96865452011-03-01 23:44:09 +00002788 if (*f == '%') {
2789 /* "%.3%s" => f points to "3" */
2790 f--;
2791 }
2792 }
2793 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002794 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002795 f--;
2796 }
Victor Stinner96865452011-03-01 23:44:09 +00002797
2798 /* Handle %ld, %lu, %lld and %llu. */
2799 longflag = 0;
2800 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002801 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002802 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002803 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002804 longflag = 1;
2805 ++f;
2806 }
Victor Stinner96865452011-03-01 23:44:09 +00002807 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002808 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002809 longlongflag = 1;
2810 f += 2;
2811 }
Victor Stinner96865452011-03-01 23:44:09 +00002812 }
2813 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002814 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002815 size_tflag = 1;
2816 ++f;
2817 }
Victor Stinnere215d962012-10-06 23:03:36 +02002818
2819 if (f[1] == '\0')
2820 writer->overallocate = 0;
2821
2822 switch (*f) {
2823 case 'c':
2824 {
2825 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002826 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002827 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002828 "character argument not in range(0x110000)");
2829 return NULL;
2830 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002831 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002832 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002833 break;
2834 }
2835
2836 case 'i':
2837 case 'd':
2838 case 'u':
2839 case 'x':
2840 {
2841 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002842 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002843 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002844
2845 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002846 if (longflag) {
2847 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2848 }
2849 else if (longlongflag) {
2850 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2851 }
2852 else if (size_tflag) {
2853 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2854 }
2855 else {
2856 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2857 }
Victor Stinnere215d962012-10-06 23:03:36 +02002858 }
2859 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002860 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002861 }
2862 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002863 if (longflag) {
2864 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2865 }
2866 else if (longlongflag) {
2867 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2868 }
2869 else if (size_tflag) {
2870 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2871 }
2872 else {
2873 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2874 }
Victor Stinnere215d962012-10-06 23:03:36 +02002875 }
2876 assert(len >= 0);
2877
Victor Stinnere215d962012-10-06 23:03:36 +02002878 if (precision < len)
2879 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002880
2881 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002882 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2883 return NULL;
2884
Victor Stinnere215d962012-10-06 23:03:36 +02002885 if (width > precision) {
2886 Py_UCS4 fillchar;
2887 fill = width - precision;
2888 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002889 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2890 return NULL;
2891 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002892 }
Victor Stinner15a11362012-10-06 23:48:20 +02002893 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002894 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002895 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2896 return NULL;
2897 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002898 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002899
Victor Stinner4a587072013-11-19 12:54:53 +01002900 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2901 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002902 break;
2903 }
2904
2905 case 'p':
2906 {
2907 char number[MAX_LONG_LONG_CHARS];
2908
2909 len = sprintf(number, "%p", va_arg(*vargs, void*));
2910 assert(len >= 0);
2911
2912 /* %p is ill-defined: ensure leading 0x. */
2913 if (number[1] == 'X')
2914 number[1] = 'x';
2915 else if (number[1] != 'x') {
2916 memmove(number + 2, number,
2917 strlen(number) + 1);
2918 number[0] = '0';
2919 number[1] = 'x';
2920 len += 2;
2921 }
2922
Victor Stinner4a587072013-11-19 12:54:53 +01002923 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002924 return NULL;
2925 break;
2926 }
2927
2928 case 's':
2929 {
2930 /* UTF-8 */
2931 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002932 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002933 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002934 break;
2935 }
2936
2937 case 'U':
2938 {
2939 PyObject *obj = va_arg(*vargs, PyObject *);
2940 assert(obj && _PyUnicode_CHECK(obj));
2941
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002942 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002943 return NULL;
2944 break;
2945 }
2946
2947 case 'V':
2948 {
2949 PyObject *obj = va_arg(*vargs, PyObject *);
2950 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002951 if (obj) {
2952 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002953 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002954 return NULL;
2955 }
2956 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002957 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002958 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002959 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002960 }
2961 break;
2962 }
2963
2964 case 'S':
2965 {
2966 PyObject *obj = va_arg(*vargs, PyObject *);
2967 PyObject *str;
2968 assert(obj);
2969 str = PyObject_Str(obj);
2970 if (!str)
2971 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002972 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002973 Py_DECREF(str);
2974 return NULL;
2975 }
2976 Py_DECREF(str);
2977 break;
2978 }
2979
2980 case 'R':
2981 {
2982 PyObject *obj = va_arg(*vargs, PyObject *);
2983 PyObject *repr;
2984 assert(obj);
2985 repr = PyObject_Repr(obj);
2986 if (!repr)
2987 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002988 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002989 Py_DECREF(repr);
2990 return NULL;
2991 }
2992 Py_DECREF(repr);
2993 break;
2994 }
2995
2996 case 'A':
2997 {
2998 PyObject *obj = va_arg(*vargs, PyObject *);
2999 PyObject *ascii;
3000 assert(obj);
3001 ascii = PyObject_ASCII(obj);
3002 if (!ascii)
3003 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003004 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003005 Py_DECREF(ascii);
3006 return NULL;
3007 }
3008 Py_DECREF(ascii);
3009 break;
3010 }
3011
3012 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003013 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003014 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003015 break;
3016
3017 default:
3018 /* if we stumble upon an unknown formatting code, copy the rest
3019 of the format string to the output string. (we cannot just
3020 skip the code, since there's no way to know what's in the
3021 argument list) */
3022 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003023 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003024 return NULL;
3025 f = p+len;
3026 return f;
3027 }
3028
3029 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003030 return f;
3031}
3032
Walter Dörwaldd2034312007-05-18 16:29:38 +00003033PyObject *
3034PyUnicode_FromFormatV(const char *format, va_list vargs)
3035{
Victor Stinnere215d962012-10-06 23:03:36 +02003036 va_list vargs2;
3037 const char *f;
3038 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003039
Victor Stinner8f674cc2013-04-17 23:02:17 +02003040 _PyUnicodeWriter_Init(&writer);
3041 writer.min_length = strlen(format) + 100;
3042 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003043
Benjamin Peterson0c212142016-09-20 20:39:33 -07003044 // Copy varags to be able to pass a reference to a subfunction.
3045 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003046
3047 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003048 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003049 f = unicode_fromformat_arg(&writer, f, &vargs2);
3050 if (f == NULL)
3051 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003053 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003054 const char *p;
3055 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003056
Victor Stinnere215d962012-10-06 23:03:36 +02003057 p = f;
3058 do
3059 {
3060 if ((unsigned char)*p > 127) {
3061 PyErr_Format(PyExc_ValueError,
3062 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3063 "string, got a non-ASCII byte: 0x%02x",
3064 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003065 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003066 }
3067 p++;
3068 }
3069 while (*p != '\0' && *p != '%');
3070 len = p - f;
3071
3072 if (*p == '\0')
3073 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003074
3075 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003076 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003077
3078 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003079 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003080 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003081 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003082 return _PyUnicodeWriter_Finish(&writer);
3083
3084 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003085 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003086 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003087 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003088}
3089
Walter Dörwaldd2034312007-05-18 16:29:38 +00003090PyObject *
3091PyUnicode_FromFormat(const char *format, ...)
3092{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003093 PyObject* ret;
3094 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003095
3096#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003097 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003098#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003099 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003100#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003101 ret = PyUnicode_FromFormatV(format, vargs);
3102 va_end(vargs);
3103 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003104}
3105
Serhiy Storchakac46db922018-10-23 22:58:24 +03003106static Py_ssize_t
3107unicode_get_widechar_size(PyObject *unicode)
3108{
3109 Py_ssize_t res;
3110
3111 assert(unicode != NULL);
3112 assert(_PyUnicode_CHECK(unicode));
3113
3114 if (_PyUnicode_WSTR(unicode) != NULL) {
3115 return PyUnicode_WSTR_LENGTH(unicode);
3116 }
3117 assert(PyUnicode_IS_READY(unicode));
3118
3119 res = _PyUnicode_LENGTH(unicode);
3120#if SIZEOF_WCHAR_T == 2
3121 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3122 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3123 const Py_UCS4 *end = s + res;
3124 for (; s < end; ++s) {
3125 if (*s > 0xFFFF) {
3126 ++res;
3127 }
3128 }
3129 }
3130#endif
3131 return res;
3132}
3133
3134static void
3135unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3136{
3137 const wchar_t *wstr;
3138
3139 assert(unicode != NULL);
3140 assert(_PyUnicode_CHECK(unicode));
3141
3142 wstr = _PyUnicode_WSTR(unicode);
3143 if (wstr != NULL) {
3144 memcpy(w, wstr, size * sizeof(wchar_t));
3145 return;
3146 }
3147 assert(PyUnicode_IS_READY(unicode));
3148
3149 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3150 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3151 for (; size--; ++s, ++w) {
3152 *w = *s;
3153 }
3154 }
3155 else {
3156#if SIZEOF_WCHAR_T == 4
3157 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3158 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3159 for (; size--; ++s, ++w) {
3160 *w = *s;
3161 }
3162#else
3163 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3164 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3165 for (; size--; ++s, ++w) {
3166 Py_UCS4 ch = *s;
3167 if (ch > 0xFFFF) {
3168 assert(ch <= MAX_UNICODE);
3169 /* encode surrogate pair in this case */
3170 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3171 if (!size--)
3172 break;
3173 *w = Py_UNICODE_LOW_SURROGATE(ch);
3174 }
3175 else {
3176 *w = ch;
3177 }
3178 }
3179#endif
3180 }
3181}
3182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003183#ifdef HAVE_WCHAR_H
3184
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003185/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003186
Victor Stinnerd88d9832011-09-06 02:00:05 +02003187 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003188 character) required to convert the unicode object. Ignore size argument.
3189
Victor Stinnerd88d9832011-09-06 02:00:05 +02003190 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003191 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003192 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003193Py_ssize_t
3194PyUnicode_AsWideChar(PyObject *unicode,
3195 wchar_t *w,
3196 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003197{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003198 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003199
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003200 if (unicode == NULL) {
3201 PyErr_BadInternalCall();
3202 return -1;
3203 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003204 if (!PyUnicode_Check(unicode)) {
3205 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003206 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003207 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003208
3209 res = unicode_get_widechar_size(unicode);
3210 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003211 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003212 }
3213
3214 if (size > res) {
3215 size = res + 1;
3216 }
3217 else {
3218 res = size;
3219 }
3220 unicode_copy_as_widechar(unicode, w, size);
3221 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003222}
3223
Victor Stinner137c34c2010-09-29 10:25:54 +00003224wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003225PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003226 Py_ssize_t *size)
3227{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003228 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003229 Py_ssize_t buflen;
3230
3231 if (unicode == NULL) {
3232 PyErr_BadInternalCall();
3233 return NULL;
3234 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003235 if (!PyUnicode_Check(unicode)) {
3236 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003237 return NULL;
3238 }
3239
Serhiy Storchakac46db922018-10-23 22:58:24 +03003240 buflen = unicode_get_widechar_size(unicode);
3241 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003242 if (buffer == NULL) {
3243 PyErr_NoMemory();
3244 return NULL;
3245 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003246 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3247 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003248 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003249 }
3250 else if (wcslen(buffer) != (size_t)buflen) {
3251 PyMem_FREE(buffer);
3252 PyErr_SetString(PyExc_ValueError,
3253 "embedded null character");
3254 return NULL;
3255 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003256 return buffer;
3257}
3258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003259#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260
Alexander Belopolsky40018472011-02-26 01:02:56 +00003261PyObject *
3262PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003263{
Victor Stinner8faf8212011-12-08 22:14:11 +01003264 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 PyErr_SetString(PyExc_ValueError,
3266 "chr() arg not in range(0x110000)");
3267 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003268 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003269
Victor Stinner985a82a2014-01-03 12:53:47 +01003270 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003271}
3272
Alexander Belopolsky40018472011-02-26 01:02:56 +00003273PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003274PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003276 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003277 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003278 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003279 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003280 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 Py_INCREF(obj);
3282 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003283 }
3284 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 /* For a Unicode subtype that's not a Unicode object,
3286 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003287 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003288 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003289 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003290 "Can't convert '%.100s' object to str implicitly",
3291 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003292 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003293}
3294
Alexander Belopolsky40018472011-02-26 01:02:56 +00003295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003296PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003297 const char *encoding,
3298 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003299{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003300 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003301 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003302
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003304 PyErr_BadInternalCall();
3305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003307
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003308 /* Decoding bytes objects is the most common case and should be fast */
3309 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003310 if (PyBytes_GET_SIZE(obj) == 0) {
3311 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3312 return NULL;
3313 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003314 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003315 }
3316 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003317 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3318 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003319 }
3320
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003321 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 PyErr_SetString(PyExc_TypeError,
3323 "decoding str is not supported");
3324 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003325 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003326
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003327 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3328 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3329 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003330 "decoding to str: need a bytes-like object, %.80s found",
3331 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003332 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003333 }
Tim Petersced69f82003-09-16 20:30:58 +00003334
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003335 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003336 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003337 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3338 return NULL;
3339 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003340 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003342
Serhiy Storchaka05997252013-01-26 12:14:02 +02003343 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003344 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003345 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346}
3347
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase. Every run of
   other characters between two copied characters becomes a single '_';
   leading and trailing runs are dropped.

   `lower` receives the null-terminated result and has room for `lower_len`
   bytes. Return 1 on success, or 0 on error (the normalized name is longer
   than lower_len-1, or lower_len is 0 so that not even the null terminator
   fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* lower_len - 1 would wrap around and disable every bounds check */
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the terminator */
    punct = 0;
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* remember the separator; it is emitted lazily */
            punct = 1;
            continue;
        }

        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 Py_ssize_t size,
3400 const char *encoding,
3401 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003402{
3403 PyObject *buffer = NULL, *unicode;
3404 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003405 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3406
Victor Stinner22eb6892019-06-26 00:51:05 +02003407 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3408 return NULL;
3409 }
3410
Victor Stinnered076ed2019-06-26 01:49:32 +02003411 if (size == 0) {
3412 _Py_RETURN_UNICODE_EMPTY();
3413 }
3414
Victor Stinner942889a2016-09-05 15:40:10 -07003415 if (encoding == NULL) {
3416 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3417 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003418
Fred Drakee4315f52000-05-09 19:53:39 +00003419 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003420 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3421 char *lower = buflower;
3422
3423 /* Fast paths */
3424 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3425 lower += 3;
3426 if (*lower == '_') {
3427 /* Match "utf8" and "utf_8" */
3428 lower++;
3429 }
3430
3431 if (lower[0] == '8' && lower[1] == 0) {
3432 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3433 }
3434 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3435 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3436 }
3437 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3438 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3439 }
3440 }
3441 else {
3442 if (strcmp(lower, "ascii") == 0
3443 || strcmp(lower, "us_ascii") == 0) {
3444 return PyUnicode_DecodeASCII(s, size, errors);
3445 }
Steve Dowercc16be82016-09-08 10:35:16 -07003446 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003447 else if (strcmp(lower, "mbcs") == 0) {
3448 return PyUnicode_DecodeMBCS(s, size, errors);
3449 }
3450 #endif
3451 else if (strcmp(lower, "latin1") == 0
3452 || strcmp(lower, "latin_1") == 0
3453 || strcmp(lower, "iso_8859_1") == 0
3454 || strcmp(lower, "iso8859_1") == 0) {
3455 return PyUnicode_DecodeLatin1(s, size, errors);
3456 }
3457 }
Victor Stinner37296e82010-06-10 13:36:23 +00003458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459
3460 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003461 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003462 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003463 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003464 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 if (buffer == NULL)
3466 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003467 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 if (unicode == NULL)
3469 goto onError;
3470 if (!PyUnicode_Check(unicode)) {
3471 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003472 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003473 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003474 encoding,
3475 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 Py_DECREF(unicode);
3477 goto onError;
3478 }
3479 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003480 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003481
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 Py_XDECREF(buffer);
3484 return NULL;
3485}
3486
Alexander Belopolsky40018472011-02-26 01:02:56 +00003487PyObject *
3488PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003489 const char *encoding,
3490 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003491{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003492 if (!PyUnicode_Check(unicode)) {
3493 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003494 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003495 }
3496
Serhiy Storchaka00939072016-10-27 21:05:49 +03003497 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3498 "PyUnicode_AsDecodedObject() is deprecated; "
3499 "use PyCodec_Decode() to decode from str", 1) < 0)
3500 return NULL;
3501
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003502 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003503 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003504
3505 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003506 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003507}
3508
Alexander Belopolsky40018472011-02-26 01:02:56 +00003509PyObject *
3510PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003511 const char *encoding,
3512 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003513{
3514 PyObject *v;
3515
3516 if (!PyUnicode_Check(unicode)) {
3517 PyErr_BadArgument();
3518 goto onError;
3519 }
3520
Serhiy Storchaka00939072016-10-27 21:05:49 +03003521 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3522 "PyUnicode_AsDecodedUnicode() is deprecated; "
3523 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3524 return NULL;
3525
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003526 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003527 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003528
3529 /* Decode via the codec registry */
3530 v = PyCodec_Decode(unicode, encoding, errors);
3531 if (v == NULL)
3532 goto onError;
3533 if (!PyUnicode_Check(v)) {
3534 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003535 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003536 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003537 encoding,
3538 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003539 Py_DECREF(v);
3540 goto onError;
3541 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003542 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003543
Benjamin Peterson29060642009-01-31 22:14:21 +00003544 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003545 return NULL;
3546}
3547
Alexander Belopolsky40018472011-02-26 01:02:56 +00003548PyObject *
3549PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003550 Py_ssize_t size,
3551 const char *encoding,
3552 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553{
3554 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003555
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003556 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3560 Py_DECREF(unicode);
3561 return v;
3562}
3563
Alexander Belopolsky40018472011-02-26 01:02:56 +00003564PyObject *
3565PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003566 const char *encoding,
3567 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003568{
3569 PyObject *v;
3570
3571 if (!PyUnicode_Check(unicode)) {
3572 PyErr_BadArgument();
3573 goto onError;
3574 }
3575
Serhiy Storchaka00939072016-10-27 21:05:49 +03003576 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3577 "PyUnicode_AsEncodedObject() is deprecated; "
3578 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3579 "or PyCodec_Encode() for generic encoding", 1) < 0)
3580 return NULL;
3581
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003582 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003583 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003584
3585 /* Encode via the codec registry */
3586 v = PyCodec_Encode(unicode, encoding, errors);
3587 if (v == NULL)
3588 goto onError;
3589 return v;
3590
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003592 return NULL;
3593}
3594
Victor Stinner1b579672011-12-17 05:47:23 +01003595
Victor Stinner2cba6b82018-01-10 22:46:15 +01003596static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003597unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003598 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003599{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003600 Py_ssize_t wlen;
3601 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3602 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003603 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003604 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003605
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003606 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003607 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003608 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003609 return NULL;
3610 }
3611
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003612 char *str;
3613 size_t error_pos;
3614 const char *reason;
3615 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003616 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003617 PyMem_Free(wstr);
3618
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003619 if (res != 0) {
3620 if (res == -2) {
3621 PyObject *exc;
3622 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3623 "locale", unicode,
3624 (Py_ssize_t)error_pos,
3625 (Py_ssize_t)(error_pos+1),
3626 reason);
3627 if (exc != NULL) {
3628 PyCodec_StrictErrors(exc);
3629 Py_DECREF(exc);
3630 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003631 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003632 else if (res == -3) {
3633 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3634 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003635 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003636 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003637 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003638 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003639 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003640
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003641 PyObject *bytes = PyBytes_FromString(str);
3642 PyMem_RawFree(str);
3643 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003644}
3645
Victor Stinnerad158722010-10-27 00:25:46 +00003646PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003647PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3648{
Victor Stinner709d23d2019-05-02 14:56:30 -04003649 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3650 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003651}
3652
3653PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003654PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003655{
Victor Stinner81a7be32020-04-14 15:14:01 +02003656 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003657 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3658 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003659 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003660 fs_codec->error_handler,
3661 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003662 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003663#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003664 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003665 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003666 fs_codec->encoding,
3667 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003668 }
Victor Stinnerad158722010-10-27 00:25:46 +00003669#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003670 else {
3671 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3672 machinery is not ready and so cannot be used:
3673 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003674 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3675 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003676 assert(filesystem_errors != NULL);
3677 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3678 assert(errors != _Py_ERROR_UNKNOWN);
3679#ifdef _Py_FORCE_UTF8_FS_ENCODING
3680 return unicode_encode_utf8(unicode, errors, NULL);
3681#else
3682 return unicode_encode_locale(unicode, errors, 0);
3683#endif
3684 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003685}
3686
Alexander Belopolsky40018472011-02-26 01:02:56 +00003687PyObject *
3688PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003689 const char *encoding,
3690 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691{
3692 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003693 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 if (!PyUnicode_Check(unicode)) {
3696 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003697 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 }
Fred Drakee4315f52000-05-09 19:53:39 +00003699
Victor Stinner22eb6892019-06-26 00:51:05 +02003700 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3701 return NULL;
3702 }
3703
Victor Stinner942889a2016-09-05 15:40:10 -07003704 if (encoding == NULL) {
3705 return _PyUnicode_AsUTF8String(unicode, errors);
3706 }
3707
Fred Drakee4315f52000-05-09 19:53:39 +00003708 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003709 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3710 char *lower = buflower;
3711
3712 /* Fast paths */
3713 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3714 lower += 3;
3715 if (*lower == '_') {
3716 /* Match "utf8" and "utf_8" */
3717 lower++;
3718 }
3719
3720 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003722 }
3723 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3724 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3725 }
3726 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3727 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3728 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003729 }
Victor Stinner942889a2016-09-05 15:40:10 -07003730 else {
3731 if (strcmp(lower, "ascii") == 0
3732 || strcmp(lower, "us_ascii") == 0) {
3733 return _PyUnicode_AsASCIIString(unicode, errors);
3734 }
Steve Dowercc16be82016-09-08 10:35:16 -07003735#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003736 else if (strcmp(lower, "mbcs") == 0) {
3737 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3738 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003739#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003740 else if (strcmp(lower, "latin1") == 0 ||
3741 strcmp(lower, "latin_1") == 0 ||
3742 strcmp(lower, "iso_8859_1") == 0 ||
3743 strcmp(lower, "iso8859_1") == 0) {
3744 return _PyUnicode_AsLatin1String(unicode, errors);
3745 }
3746 }
Victor Stinner37296e82010-06-10 13:36:23 +00003747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748
3749 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003750 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003752 return NULL;
3753
3754 /* The normal path */
3755 if (PyBytes_Check(v))
3756 return v;
3757
3758 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003759 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003760 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003761 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003762
3763 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003764 "encoder %s returned bytearray instead of bytes; "
3765 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003766 encoding);
3767 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003768 Py_DECREF(v);
3769 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003770 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003771
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003772 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3773 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003774 Py_DECREF(v);
3775 return b;
3776 }
3777
3778 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003779 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003780 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003781 encoding,
3782 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003783 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003784 return NULL;
3785}
3786
Alexander Belopolsky40018472011-02-26 01:02:56 +00003787PyObject *
3788PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003789 const char *encoding,
3790 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003791{
3792 PyObject *v;
3793
3794 if (!PyUnicode_Check(unicode)) {
3795 PyErr_BadArgument();
3796 goto onError;
3797 }
3798
Serhiy Storchaka00939072016-10-27 21:05:49 +03003799 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3800 "PyUnicode_AsEncodedUnicode() is deprecated; "
3801 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3802 return NULL;
3803
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003804 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003805 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003806
3807 /* Encode via the codec registry */
3808 v = PyCodec_Encode(unicode, encoding, errors);
3809 if (v == NULL)
3810 goto onError;
3811 if (!PyUnicode_Check(v)) {
3812 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003813 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003814 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003815 encoding,
3816 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003817 Py_DECREF(v);
3818 goto onError;
3819 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003821
Benjamin Peterson29060642009-01-31 22:14:21 +00003822 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 return NULL;
3824}
3825
Victor Stinner2cba6b82018-01-10 22:46:15 +01003826static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003827unicode_decode_locale(const char *str, Py_ssize_t len,
3828 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003829{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003830 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3831 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003832 return NULL;
3833 }
3834
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003835 wchar_t *wstr;
3836 size_t wlen;
3837 const char *reason;
3838 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003839 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003840 if (res != 0) {
3841 if (res == -2) {
3842 PyObject *exc;
3843 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3844 "locale", str, len,
3845 (Py_ssize_t)wlen,
3846 (Py_ssize_t)(wlen + 1),
3847 reason);
3848 if (exc != NULL) {
3849 PyCodec_StrictErrors(exc);
3850 Py_DECREF(exc);
3851 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003852 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003853 else if (res == -3) {
3854 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3855 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003856 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003857 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003858 }
Victor Stinner2f197072011-12-17 07:08:30 +01003859 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003860 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003861
3862 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3863 PyMem_RawFree(wstr);
3864 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003865}
3866
3867PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003868PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3869 const char *errors)
3870{
Victor Stinner709d23d2019-05-02 14:56:30 -04003871 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3872 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003873}
3874
3875PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003876PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003877{
3878 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003879 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3880 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003881}
3882
3883
3884PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003885PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003886 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003887 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3888}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003889
Christian Heimes5894ba72007-11-04 11:43:14 +00003890PyObject*
3891PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3892{
Victor Stinner81a7be32020-04-14 15:14:01 +02003893 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003894 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3895 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003896 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003897 fs_codec->error_handler,
3898 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003899 NULL);
3900 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003901#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003902 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003903 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003904 fs_codec->encoding,
3905 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003906 }
Victor Stinnerad158722010-10-27 00:25:46 +00003907#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003908 else {
3909 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3910 machinery is not ready and so cannot be used:
3911 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003912 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3913 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003914 assert(filesystem_errors != NULL);
3915 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3916 assert(errors != _Py_ERROR_UNKNOWN);
3917#ifdef _Py_FORCE_UTF8_FS_ENCODING
3918 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3919#else
3920 return unicode_decode_locale(s, size, errors, 0);
3921#endif
3922 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003923}
3924
Martin v. Löwis011e8422009-05-05 04:43:17 +00003925
3926int
3927PyUnicode_FSConverter(PyObject* arg, void* addr)
3928{
Brett Cannonec6ce872016-09-06 15:50:29 -07003929 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003930 PyObject *output = NULL;
3931 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003932 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003933 if (arg == NULL) {
3934 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003935 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003936 return 1;
3937 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003938 path = PyOS_FSPath(arg);
3939 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003940 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003941 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003942 if (PyBytes_Check(path)) {
3943 output = path;
3944 }
3945 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3946 output = PyUnicode_EncodeFSDefault(path);
3947 Py_DECREF(path);
3948 if (!output) {
3949 return 0;
3950 }
3951 assert(PyBytes_Check(output));
3952 }
3953
Victor Stinner0ea2a462010-04-30 00:22:08 +00003954 size = PyBytes_GET_SIZE(output);
3955 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003956 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003957 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003958 Py_DECREF(output);
3959 return 0;
3960 }
3961 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003962 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003963}
3964
3965
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003966int
3967PyUnicode_FSDecoder(PyObject* arg, void* addr)
3968{
Brett Cannona5711202016-09-06 19:36:01 -07003969 int is_buffer = 0;
3970 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003971 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003972 if (arg == NULL) {
3973 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003974 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003975 return 1;
3976 }
Brett Cannona5711202016-09-06 19:36:01 -07003977
3978 is_buffer = PyObject_CheckBuffer(arg);
3979 if (!is_buffer) {
3980 path = PyOS_FSPath(arg);
3981 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003982 return 0;
3983 }
Brett Cannona5711202016-09-06 19:36:01 -07003984 }
3985 else {
3986 path = arg;
3987 Py_INCREF(arg);
3988 }
3989
3990 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003991 output = path;
3992 }
3993 else if (PyBytes_Check(path) || is_buffer) {
3994 PyObject *path_bytes = NULL;
3995
3996 if (!PyBytes_Check(path) &&
3997 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003998 "path should be string, bytes, or os.PathLike, not %.200s",
3999 Py_TYPE(arg)->tp_name)) {
4000 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004001 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004002 }
4003 path_bytes = PyBytes_FromObject(path);
4004 Py_DECREF(path);
4005 if (!path_bytes) {
4006 return 0;
4007 }
4008 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4009 PyBytes_GET_SIZE(path_bytes));
4010 Py_DECREF(path_bytes);
4011 if (!output) {
4012 return 0;
4013 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004014 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004015 else {
4016 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004017 "path should be string, bytes, or os.PathLike, not %.200s",
4018 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004019 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004020 return 0;
4021 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004022 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004023 Py_DECREF(output);
4024 return 0;
4025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004026 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004027 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004028 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004029 Py_DECREF(output);
4030 return 0;
4031 }
4032 *(PyObject**)addr = output;
4033 return Py_CLEANUP_SUPPORTED;
4034}
4035
4036
Inada Naoki02a4d572020-02-27 13:48:59 +09004037static int unicode_fill_utf8(PyObject *unicode);
4038
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004039const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004041{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004042 if (!PyUnicode_Check(unicode)) {
4043 PyErr_BadArgument();
4044 return NULL;
4045 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004046 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004047 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004049 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004050 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 return NULL;
4052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 }
4054
4055 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004056 *psize = PyUnicode_UTF8_LENGTH(unicode);
4057 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004058}
4059
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004060const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4064}
4065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066Py_UNICODE *
4067PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4068{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 if (!PyUnicode_Check(unicode)) {
4070 PyErr_BadArgument();
4071 return NULL;
4072 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004073 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4074 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004075 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004076 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004077 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078
Serhiy Storchakac46db922018-10-23 22:58:24 +03004079 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4080 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4081 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004084 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4085 if (w == NULL) {
4086 PyErr_NoMemory();
4087 return NULL;
4088 }
4089 unicode_copy_as_widechar(unicode, w, wlen + 1);
4090 _PyUnicode_WSTR(unicode) = w;
4091 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4092 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 }
4094 }
4095 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004096 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004097 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004098}
4099
Alexander Belopolsky40018472011-02-26 01:02:56 +00004100Py_UNICODE *
4101PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004103 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104}
4105
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004106const Py_UNICODE *
4107_PyUnicode_AsUnicode(PyObject *unicode)
4108{
4109 Py_ssize_t size;
4110 const Py_UNICODE *wstr;
4111
4112 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4113 if (wstr && wcslen(wstr) != (size_t)size) {
4114 PyErr_SetString(PyExc_ValueError, "embedded null character");
4115 return NULL;
4116 }
4117 return wstr;
4118}
4119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120
Alexander Belopolsky40018472011-02-26 01:02:56 +00004121Py_ssize_t
4122PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123{
4124 if (!PyUnicode_Check(unicode)) {
4125 PyErr_BadArgument();
4126 goto onError;
4127 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004128 if (_PyUnicode_WSTR(unicode) == NULL) {
4129 if (PyUnicode_AsUnicode(unicode) == NULL)
4130 goto onError;
4131 }
4132 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 return -1;
4136}
4137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138Py_ssize_t
4139PyUnicode_GetLength(PyObject *unicode)
4140{
Victor Stinner07621332012-06-16 04:53:46 +02004141 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004142 PyErr_BadArgument();
4143 return -1;
4144 }
Victor Stinner07621332012-06-16 04:53:46 +02004145 if (PyUnicode_READY(unicode) == -1)
4146 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147 return PyUnicode_GET_LENGTH(unicode);
4148}
4149
4150Py_UCS4
4151PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4152{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004153 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004154 int kind;
4155
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004156 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004157 PyErr_BadArgument();
4158 return (Py_UCS4)-1;
4159 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004160 if (PyUnicode_READY(unicode) == -1) {
4161 return (Py_UCS4)-1;
4162 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004163 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004164 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165 return (Py_UCS4)-1;
4166 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004167 data = PyUnicode_DATA(unicode);
4168 kind = PyUnicode_KIND(unicode);
4169 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170}
4171
4172int
4173PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4174{
4175 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004176 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004177 return -1;
4178 }
Victor Stinner488fa492011-12-12 00:01:39 +01004179 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004180 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004181 PyErr_SetString(PyExc_IndexError, "string index out of range");
4182 return -1;
4183 }
Victor Stinner488fa492011-12-12 00:01:39 +01004184 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004185 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004186 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4187 PyErr_SetString(PyExc_ValueError, "character out of range");
4188 return -1;
4189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004190 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4191 index, ch);
4192 return 0;
4193}
4194
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage; the caller must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4200
Victor Stinner554f3f02010-06-16 23:33:54 +00004201/* create or adjust a UnicodeDecodeError */
4202static void
4203make_decode_exception(PyObject **exceptionObject,
4204 const char *encoding,
4205 const char *input, Py_ssize_t length,
4206 Py_ssize_t startpos, Py_ssize_t endpos,
4207 const char *reason)
4208{
4209 if (*exceptionObject == NULL) {
4210 *exceptionObject = PyUnicodeDecodeError_Create(
4211 encoding, input, length, startpos, endpos, reason);
4212 }
4213 else {
4214 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4215 goto onError;
4216 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4217 goto onError;
4218 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4219 goto onError;
4220 }
4221 return;
4222
4223onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004224 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004225}
4226
Steve Dowercc16be82016-09-08 10:35:16 -07004227#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004228static int
4229widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4230{
4231 if (newsize > *size) {
4232 wchar_t *newbuf = *buf;
4233 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4234 PyErr_NoMemory();
4235 return -1;
4236 }
4237 *buf = newbuf;
4238 }
4239 *size = newsize;
4240 return 0;
4241}
4242
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243/* error handling callback helper:
4244 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004245 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246 and adjust various state variables.
4247 return 0 on success, -1 on error
4248*/
4249
Alexander Belopolsky40018472011-02-26 01:02:56 +00004250static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004251unicode_decode_call_errorhandler_wchar(
4252 const char *errors, PyObject **errorHandler,
4253 const char *encoding, const char *reason,
4254 const char **input, const char **inend, Py_ssize_t *startinpos,
4255 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004256 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004258 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259
4260 PyObject *restuple = NULL;
4261 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004262 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004263 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004264 Py_ssize_t requiredsize;
4265 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004266 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004267 wchar_t *repwstr;
4268 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269
4270 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 *errorHandler = PyCodec_LookupError(errors);
4272 if (*errorHandler == NULL)
4273 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 }
4275
Victor Stinner554f3f02010-06-16 23:33:54 +00004276 make_decode_exception(exceptionObject,
4277 encoding,
4278 *input, *inend - *input,
4279 *startinpos, *endinpos,
4280 reason);
4281 if (*exceptionObject == NULL)
4282 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283
Petr Viktorinffd97532020-02-11 17:46:57 +01004284 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004287 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004288 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004291 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004293
4294 /* Copy back the bytes variables, which might have been modified by the
4295 callback */
4296 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4297 if (!inputobj)
4298 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004299 *input = PyBytes_AS_STRING(inputobj);
4300 insize = PyBytes_GET_SIZE(inputobj);
4301 *inend = *input + insize;
4302 /* we can DECREF safely, as the exception has another reference,
4303 so the object won't go away. */
4304 Py_DECREF(inputobj);
4305
4306 if (newpos<0)
4307 newpos = insize+newpos;
4308 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004309 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310 goto onError;
4311 }
4312
4313 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4314 if (repwstr == NULL)
4315 goto onError;
4316 /* need more space? (at least enough for what we
4317 have+the replacement+the rest of the string (starting
4318 at the new input position), so we won't have to check space
4319 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004320 requiredsize = *outpos;
4321 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4322 goto overflow;
4323 requiredsize += repwlen;
4324 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4325 goto overflow;
4326 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004327 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004329 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004331 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004333 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004334 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004335 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004336 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 *endinpos = newpos;
4338 *inptr = *input + newpos;
4339
4340 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004341 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342 return 0;
4343
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004344 overflow:
4345 PyErr_SetString(PyExc_OverflowError,
4346 "decoded result is too long for a Python string");
4347
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004348 onError:
4349 Py_XDECREF(restuple);
4350 return -1;
4351}
Steve Dowercc16be82016-09-08 10:35:16 -07004352#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004353
4354static int
4355unicode_decode_call_errorhandler_writer(
4356 const char *errors, PyObject **errorHandler,
4357 const char *encoding, const char *reason,
4358 const char **input, const char **inend, Py_ssize_t *startinpos,
4359 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4360 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4361{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004362 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004363
4364 PyObject *restuple = NULL;
4365 PyObject *repunicode = NULL;
4366 Py_ssize_t insize;
4367 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004368 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004369 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004370 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004371 int need_to_grow = 0;
4372 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004373
4374 if (*errorHandler == NULL) {
4375 *errorHandler = PyCodec_LookupError(errors);
4376 if (*errorHandler == NULL)
4377 goto onError;
4378 }
4379
4380 make_decode_exception(exceptionObject,
4381 encoding,
4382 *input, *inend - *input,
4383 *startinpos, *endinpos,
4384 reason);
4385 if (*exceptionObject == NULL)
4386 goto onError;
4387
Petr Viktorinffd97532020-02-11 17:46:57 +01004388 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004389 if (restuple == NULL)
4390 goto onError;
4391 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004392 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004393 goto onError;
4394 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004395 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004396 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004397
4398 /* Copy back the bytes variables, which might have been modified by the
4399 callback */
4400 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4401 if (!inputobj)
4402 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004403 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004404 *input = PyBytes_AS_STRING(inputobj);
4405 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004406 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004407 /* we can DECREF safely, as the exception has another reference,
4408 so the object won't go away. */
4409 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004413 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004414 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004415 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004416 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417
Victor Stinner170ca6f2013-04-18 00:25:28 +02004418 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004419 if (replen > 1) {
4420 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004421 need_to_grow = 1;
4422 }
4423 new_inptr = *input + newpos;
4424 if (*inend - new_inptr > remain) {
4425 /* We don't know the decoding algorithm here so we make the worst
4426 assumption that one byte decodes to one unicode character.
4427 If unfortunately one byte could decode to more unicode characters,
4428 the decoder may write out-of-bound then. Is it possible for the
4429 algorithms using this function? */
4430 writer->min_length += *inend - new_inptr - remain;
4431 need_to_grow = 1;
4432 }
4433 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004434 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004435 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004436 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4437 goto onError;
4438 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004439 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004440 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004443 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004446 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004447 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004451 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452}
4453
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  All of them may evaluate their
   argument more than once, so pass expressions without side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        character code; anything outside 1..127 is never direct (the
 *          range test runs before the table lookup, so the index is safe).
 * directO  non-zero if Set O characters are encoded as themselves.
 * directWS non-zero if whitespace characters are encoded as themselves.
 *
 * The flag arguments are parenthesized so that compound expressions such
 * as `a || b` keep their meaning inside the macro. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004534
Alexander Belopolsky40018472011-02-26 01:02:56 +00004535PyObject *
4536PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004537 Py_ssize_t size,
4538 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004539{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004540 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4541}
4542
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543/* The decoder. The only state we preserve is our read position,
4544 * i.e. how many characters we have consumed. So if we end in the
4545 * middle of a shift sequence we have to back off the read position
4546 * and the output to the beginning of the sequence, otherwise we lose
4547 * all the shift state (seen bits, number of bits seen, high
4548 * surrogate). */
4549
Alexander Belopolsky40018472011-02-26 01:02:56 +00004550PyObject *
4551PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004552 Py_ssize_t size,
4553 const char *errors,
4554 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004555{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004557 Py_ssize_t startinpos;
4558 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004559 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004560 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561 const char *errmsg = "";
4562 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004563 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 unsigned int base64bits = 0;
4565 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004566 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 PyObject *errorHandler = NULL;
4568 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004570 if (size == 0) {
4571 if (consumed)
4572 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004573 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004574 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004576 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004577 _PyUnicodeWriter_Init(&writer);
4578 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004579
4580 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581 e = s + size;
4582
4583 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004584 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004586 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004587
Antoine Pitrou244651a2009-05-04 18:56:13 +00004588 if (inShift) { /* in a base-64 section */
4589 if (IS_BASE64(ch)) { /* consume a base-64 character */
4590 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4591 base64bits += 6;
4592 s++;
4593 if (base64bits >= 16) {
4594 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004595 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 base64bits -= 16;
4597 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004598 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 if (surrogate) {
4600 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004601 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4602 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004603 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004604 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004606 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 }
4608 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004609 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004610 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 }
4613 }
Victor Stinner551ac952011-11-29 22:58:13 +01004614 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 /* first surrogate */
4616 surrogate = outCh;
4617 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004619 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004620 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621 }
4622 }
4623 }
4624 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004626 if (base64bits > 0) { /* left-over bits */
4627 if (base64bits >= 6) {
4628 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004629 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004630 errmsg = "partial character in shift sequence";
4631 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 else {
4634 /* Some bits remain; they should be zero */
4635 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004636 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004637 errmsg = "non-zero padding bits in shift sequence";
4638 goto utf7Error;
4639 }
4640 }
4641 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004642 if (surrogate && DECODE_DIRECT(ch)) {
4643 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4644 goto onError;
4645 }
4646 surrogate = 0;
4647 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004648 /* '-' is absorbed; other terminating
4649 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004650 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004651 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 }
4653 }
4654 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004656 s++; /* consume '+' */
4657 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004658 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004659 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004660 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004662 else if (s < e && !IS_BASE64(*s)) {
4663 s++;
4664 errmsg = "ill-formed sequence";
4665 goto utf7Error;
4666 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004669 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004670 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004672 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004673 }
4674 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004675 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004676 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004677 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004678 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004679 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004680 else {
4681 startinpos = s-starts;
4682 s++;
4683 errmsg = "unexpected special character";
4684 goto utf7Error;
4685 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004686 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004689 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 errors, &errorHandler,
4691 "utf7", errmsg,
4692 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004693 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695 }
4696
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 /* end of string */
4698
4699 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4700 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004701 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 if (surrogate ||
4703 (base64bits >= 6) ||
4704 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004705 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707 errors, &errorHandler,
4708 "utf7", "unterminated shift sequence",
4709 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004710 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004711 goto onError;
4712 if (s < e)
4713 goto restart;
4714 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004715 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004716
4717 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004718 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004719 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004720 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004721 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004722 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004723 writer.kind, writer.data, shiftOutStart);
4724 Py_XDECREF(errorHandler);
4725 Py_XDECREF(exc);
4726 _PyUnicodeWriter_Dealloc(&writer);
4727 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004728 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004729 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004730 }
4731 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004732 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004733 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004734 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 Py_XDECREF(errorHandler);
4737 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004738 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 Py_XDECREF(errorHandler);
4742 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004743 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004744 return NULL;
4745}
4746
4747
Alexander Belopolsky40018472011-02-26 01:02:56 +00004748PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004749_PyUnicode_EncodeUTF7(PyObject *str,
4750 int base64SetO,
4751 int base64WhiteSpace,
4752 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004754 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004755 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004756 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004757 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004759 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004760 unsigned int base64bits = 0;
4761 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004762 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004763 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764
Benjamin Petersonbac79492012-01-14 13:34:47 -05004765 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004766 return NULL;
4767 kind = PyUnicode_KIND(str);
4768 data = PyUnicode_DATA(str);
4769 len = PyUnicode_GET_LENGTH(str);
4770
4771 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004772 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004773
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004774 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004775 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004776 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004777 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004778 if (v == NULL)
4779 return NULL;
4780
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004781 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004782 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004783 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004784
Antoine Pitrou244651a2009-05-04 18:56:13 +00004785 if (inShift) {
4786 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4787 /* shifting out */
4788 if (base64bits) { /* output remaining bits */
4789 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4790 base64buffer = 0;
4791 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004792 }
4793 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 /* Characters not in the BASE64 set implicitly unshift the sequence
4795 so no '-' is required, except if the character is itself a '-' */
4796 if (IS_BASE64(ch) || ch == '-') {
4797 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004798 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004799 *out++ = (char) ch;
4800 }
4801 else {
4802 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004803 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004804 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004805 else { /* not in a shift sequence */
4806 if (ch == '+') {
4807 *out++ = '+';
4808 *out++ = '-';
4809 }
4810 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4811 *out++ = (char) ch;
4812 }
4813 else {
4814 *out++ = '+';
4815 inShift = 1;
4816 goto encode_char;
4817 }
4818 }
4819 continue;
4820encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004822 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004823
Antoine Pitrou244651a2009-05-04 18:56:13 +00004824 /* code first surrogate */
4825 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004826 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 while (base64bits >= 6) {
4828 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4829 base64bits -= 6;
4830 }
4831 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004832 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004833 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004834 base64bits += 16;
4835 base64buffer = (base64buffer << 16) | ch;
4836 while (base64bits >= 6) {
4837 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4838 base64bits -= 6;
4839 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004840 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004841 if (base64bits)
4842 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4843 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004844 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004845 if (_PyBytes_Resize(&v, out - start) < 0)
4846 return NULL;
4847 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004848}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004849PyObject *
4850PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4851 Py_ssize_t size,
4852 int base64SetO,
4853 int base64WhiteSpace,
4854 const char *errors)
4855{
4856 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004857 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004858 if (tmp == NULL)
4859 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004860 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004861 base64WhiteSpace, errors);
4862 Py_DECREF(tmp);
4863 return result;
4864}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004865
Antoine Pitrou244651a2009-05-04 18:56:13 +00004866#undef IS_BASE64
4867#undef FROM_BASE64
4868#undef TO_BASE64
4869#undef DECODE_DIRECT
4870#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004871
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872/* --- UTF-8 Codec -------------------------------------------------------- */
4873
Alexander Belopolsky40018472011-02-26 01:02:56 +00004874PyObject *
4875PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004876 Py_ssize_t size,
4877 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878{
Walter Dörwald69652032004-09-07 20:24:22 +00004879 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4880}
4881
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882#include "stringlib/asciilib.h"
4883#include "stringlib/codecs.h"
4884#include "stringlib/undef.h"
4885
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004886#include "stringlib/ucs1lib.h"
4887#include "stringlib/codecs.h"
4888#include "stringlib/undef.h"
4889
4890#include "stringlib/ucs2lib.h"
4891#include "stringlib/codecs.h"
4892#include "stringlib/undef.h"
4893
4894#include "stringlib/ucs4lib.h"
4895#include "stringlib/codecs.h"
4896#include "stringlib/undef.h"
4897
Antoine Pitrouab868312009-01-10 15:40:25 +00004898/* Mask to quickly check whether a C 'long' contains a
4899 non-ASCII, UTF8-encoded char. */
4900#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004901# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004902#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004903# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004904#else
4905# error C 'long' size should be either 4 or 8!
4906#endif
4907
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908static Py_ssize_t
4909ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004910{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004912 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004913
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004914 /*
4915 * Issue #17237: m68k is a bit different from most architectures in
4916 * that objects do not use "natural alignment" - for example, int and
4917 * long are only aligned at 2-byte boundaries. Therefore the assert()
4918 * won't work; also, tests have shown that skipping the "optimised
4919 * version" will even speed up m68k.
4920 */
4921#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004923 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4924 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 /* Fast path, see in STRINGLIB(utf8_decode) for
4926 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004927 /* Help allocation */
4928 const char *_p = p;
4929 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930 while (_p < aligned_end) {
4931 unsigned long value = *(const unsigned long *) _p;
4932 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934 *((unsigned long *)q) = value;
4935 _p += SIZEOF_LONG;
4936 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004937 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938 p = _p;
4939 while (p < end) {
4940 if ((unsigned char)*p & 0x80)
4941 break;
4942 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004947#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004948 while (p < end) {
4949 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4950 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004951 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004952 /* Help allocation */
4953 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004954 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004955 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 if (value & ASCII_CHAR_MASK)
4957 break;
4958 _p += SIZEOF_LONG;
4959 }
4960 p = _p;
4961 if (_p == end)
4962 break;
4963 }
4964 if ((unsigned char)*p & 0x80)
4965 break;
4966 ++p;
4967 }
4968 memcpy(dest, start, p - start);
4969 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970}
Antoine Pitrouab868312009-01-10 15:40:25 +00004971
Victor Stinner709d23d2019-05-02 14:56:30 -04004972static PyObject *
4973unicode_decode_utf8(const char *s, Py_ssize_t size,
4974 _Py_error_handler error_handler, const char *errors,
4975 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004976{
Victor Stinner785938e2011-12-11 20:09:03 +01004977 if (size == 0) {
4978 if (consumed)
4979 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004980 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004981 }
4982
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004983 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4984 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004985 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 *consumed = 1;
4987 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004988 }
4989
Inada Naoki770847a2019-06-24 12:30:24 +09004990 const char *starts = s;
4991 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004992
Inada Naoki770847a2019-06-24 12:30:24 +09004993 // fast path: try ASCII string.
4994 PyObject *u = PyUnicode_New(size, 127);
4995 if (u == NULL) {
4996 return NULL;
4997 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004998 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09004999 if (s == end) {
5000 return u;
5001 }
5002
5003 // Use _PyUnicodeWriter after fast path is failed.
5004 _PyUnicodeWriter writer;
5005 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5006 writer.pos = s - starts;
5007
5008 Py_ssize_t startinpos, endinpos;
5009 const char *errmsg = "";
5010 PyObject *error_handler_obj = NULL;
5011 PyObject *exc = NULL;
5012
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 while (s < end) {
5014 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005015 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005016
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005017 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005018 if (PyUnicode_IS_ASCII(writer.buffer))
5019 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005020 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005021 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005022 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005023 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 } else {
5025 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005026 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005027 }
5028
5029 switch (ch) {
5030 case 0:
5031 if (s == end || consumed)
5032 goto End;
5033 errmsg = "unexpected end of data";
5034 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005035 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005036 break;
5037 case 1:
5038 errmsg = "invalid start byte";
5039 startinpos = s - starts;
5040 endinpos = startinpos + 1;
5041 break;
5042 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005043 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5044 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5045 {
5046 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005047 goto End;
5048 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005049 /* fall through */
5050 case 3:
5051 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005052 errmsg = "invalid continuation byte";
5053 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005054 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005055 break;
5056 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005057 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005058 goto onError;
5059 continue;
5060 }
5061
Victor Stinner1d65d912015-10-05 13:43:50 +02005062 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005063 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005064
5065 switch (error_handler) {
5066 case _Py_ERROR_IGNORE:
5067 s += (endinpos - startinpos);
5068 break;
5069
5070 case _Py_ERROR_REPLACE:
5071 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5072 goto onError;
5073 s += (endinpos - startinpos);
5074 break;
5075
5076 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005077 {
5078 Py_ssize_t i;
5079
Victor Stinner1d65d912015-10-05 13:43:50 +02005080 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5081 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005082 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005083 ch = (Py_UCS4)(unsigned char)(starts[i]);
5084 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5085 ch + 0xdc00);
5086 writer.pos++;
5087 }
5088 s += (endinpos - startinpos);
5089 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005090 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005091
5092 default:
5093 if (unicode_decode_call_errorhandler_writer(
5094 errors, &error_handler_obj,
5095 "utf-8", errmsg,
5096 &starts, &end, &startinpos, &endinpos, &exc, &s,
5097 &writer))
5098 goto onError;
5099 }
Victor Stinner785938e2011-12-11 20:09:03 +01005100 }
5101
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005102End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005103 if (consumed)
5104 *consumed = s - starts;
5105
Victor Stinner1d65d912015-10-05 13:43:50 +02005106 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005107 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005108 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005109
5110onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005111 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005112 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005113 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005114 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005115}
5116
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005117
Victor Stinner709d23d2019-05-02 14:56:30 -04005118PyObject *
5119PyUnicode_DecodeUTF8Stateful(const char *s,
5120 Py_ssize_t size,
5121 const char *errors,
5122 Py_ssize_t *consumed)
5123{
5124 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5125}
5126
5127
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005128/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5129 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005130
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005131 On success, write a pointer to a newly allocated wide character string into
5132 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5133 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005134
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005135 On memory allocation failure, return -1.
5136
5137 On decoding error (if surrogateescape is zero), return -2. If wlen is
5138 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5139 is not NULL, write the decoding error message into *reason. */
5140int
5141_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005142 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005143{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005144 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005145 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005146 wchar_t *unicode;
5147 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005148
Victor Stinner3d4226a2018-08-29 22:21:32 +02005149 int surrogateescape = 0;
5150 int surrogatepass = 0;
5151 switch (errors)
5152 {
5153 case _Py_ERROR_STRICT:
5154 break;
5155 case _Py_ERROR_SURROGATEESCAPE:
5156 surrogateescape = 1;
5157 break;
5158 case _Py_ERROR_SURROGATEPASS:
5159 surrogatepass = 1;
5160 break;
5161 default:
5162 return -3;
5163 }
5164
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005165 /* Note: size will always be longer than the resulting Unicode
5166 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005167 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005168 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005169 }
5170
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005171 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005172 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005173 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005174 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005175
5176 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005177 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005178 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005179 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005180 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005181#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005182 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005183#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005184 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005185#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005186 if (ch > 0xFF) {
5187#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005188 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005189#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005190 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005191 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005192 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5193 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5194#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005195 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005196 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005197 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005198 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005199 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005200
5201 if (surrogateescape) {
5202 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5203 }
5204 else {
5205 /* Is it a valid three-byte code? */
5206 if (surrogatepass
5207 && (e - s) >= 3
5208 && (s[0] & 0xf0) == 0xe0
5209 && (s[1] & 0xc0) == 0x80
5210 && (s[2] & 0xc0) == 0x80)
5211 {
5212 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5213 s += 3;
5214 unicode[outpos++] = ch;
5215 }
5216 else {
5217 PyMem_RawFree(unicode );
5218 if (reason != NULL) {
5219 switch (ch) {
5220 case 0:
5221 *reason = "unexpected end of data";
5222 break;
5223 case 1:
5224 *reason = "invalid start byte";
5225 break;
5226 /* 2, 3, 4 */
5227 default:
5228 *reason = "invalid continuation byte";
5229 break;
5230 }
5231 }
5232 if (wlen != NULL) {
5233 *wlen = s - orig_s;
5234 }
5235 return -2;
5236 }
5237 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005238 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005239 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005240 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005241 if (wlen) {
5242 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005243 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005244 *wstr = unicode;
5245 return 0;
5246}
5247
Victor Stinner5f9cf232019-03-19 01:46:25 +01005248
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005249wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005250_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5251 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005252{
5253 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005254 int res = _Py_DecodeUTF8Ex(arg, arglen,
5255 &wstr, wlen,
5256 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005257 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005258 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5259 assert(res != -3);
5260 if (wlen) {
5261 *wlen = (size_t)res;
5262 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005263 return NULL;
5264 }
5265 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005266}
5267
Antoine Pitrouab868312009-01-10 15:40:25 +00005268
Victor Stinnere47e6982017-12-21 15:45:16 +01005269/* UTF-8 encoder using the surrogateescape error handler .
5270
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005271 On success, return 0 and write the newly allocated character string (use
5272 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005273
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005274 On encoding failure, return -2 and write the position of the invalid
5275 surrogate character into *error_pos (if error_pos is set) and the decoding
5276 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005277
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005278 On memory allocation failure, return -1. */
5279int
5280_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005281 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005282{
5283 const Py_ssize_t max_char_size = 4;
5284 Py_ssize_t len = wcslen(text);
5285
5286 assert(len >= 0);
5287
Victor Stinner3d4226a2018-08-29 22:21:32 +02005288 int surrogateescape = 0;
5289 int surrogatepass = 0;
5290 switch (errors)
5291 {
5292 case _Py_ERROR_STRICT:
5293 break;
5294 case _Py_ERROR_SURROGATEESCAPE:
5295 surrogateescape = 1;
5296 break;
5297 case _Py_ERROR_SURROGATEPASS:
5298 surrogatepass = 1;
5299 break;
5300 default:
5301 return -3;
5302 }
5303
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005304 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5305 return -1;
5306 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005307 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005308 if (raw_malloc) {
5309 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005310 }
5311 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005312 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005313 }
5314 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005315 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005316 }
5317
5318 char *p = bytes;
5319 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005320 for (i = 0; i < len; ) {
5321 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005322 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005323 i++;
5324#if Py_UNICODE_SIZE == 2
5325 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5326 && i < len
5327 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5328 {
5329 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5330 i++;
5331 }
5332#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005333
5334 if (ch < 0x80) {
5335 /* Encode ASCII */
5336 *p++ = (char) ch;
5337
5338 }
5339 else if (ch < 0x0800) {
5340 /* Encode Latin-1 */
5341 *p++ = (char)(0xc0 | (ch >> 6));
5342 *p++ = (char)(0x80 | (ch & 0x3f));
5343 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005344 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005345 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005346 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005347 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005348 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005349 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005350 if (reason != NULL) {
5351 *reason = "encoding error";
5352 }
5353 if (raw_malloc) {
5354 PyMem_RawFree(bytes);
5355 }
5356 else {
5357 PyMem_Free(bytes);
5358 }
5359 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005360 }
5361 *p++ = (char)(ch & 0xff);
5362 }
5363 else if (ch < 0x10000) {
5364 *p++ = (char)(0xe0 | (ch >> 12));
5365 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5366 *p++ = (char)(0x80 | (ch & 0x3f));
5367 }
5368 else { /* ch >= 0x10000 */
5369 assert(ch <= MAX_UNICODE);
5370 /* Encode UCS4 Unicode ordinals */
5371 *p++ = (char)(0xf0 | (ch >> 18));
5372 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5373 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5374 *p++ = (char)(0x80 | (ch & 0x3f));
5375 }
5376 }
5377 *p++ = '\0';
5378
5379 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005380 char *bytes2;
5381 if (raw_malloc) {
5382 bytes2 = PyMem_RawRealloc(bytes, final_size);
5383 }
5384 else {
5385 bytes2 = PyMem_Realloc(bytes, final_size);
5386 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005387 if (bytes2 == NULL) {
5388 if (error_pos != NULL) {
5389 *error_pos = (size_t)-1;
5390 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005391 if (raw_malloc) {
5392 PyMem_RawFree(bytes);
5393 }
5394 else {
5395 PyMem_Free(bytes);
5396 }
5397 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005398 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005399 *str = bytes2;
5400 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005401}
5402
5403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005404/* Primary internal function which creates utf8 encoded bytes objects.
5405
5406 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005407 and allocate exactly as much space needed at the end. Else allocate the
5408 maximum possible needed (4 result bytes per Unicode character), and return
5409 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005410*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005411static PyObject *
5412unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5413 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005415 if (!PyUnicode_Check(unicode)) {
5416 PyErr_BadArgument();
5417 return NULL;
5418 }
5419
5420 if (PyUnicode_READY(unicode) == -1)
5421 return NULL;
5422
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005423 if (PyUnicode_UTF8(unicode))
5424 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5425 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426
Inada Naoki02a4d572020-02-27 13:48:59 +09005427 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005428 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005429 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5430
5431 _PyBytesWriter writer;
5432 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005433
Benjamin Petersonead6b532011-12-20 17:23:42 -06005434 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005435 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005436 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005437 case PyUnicode_1BYTE_KIND:
5438 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5439 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005440 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5441 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005442 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005443 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5444 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005445 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005446 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5447 break;
Tim Peters602f7402002-04-27 18:03:26 +00005448 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005449
5450 if (end == NULL) {
5451 _PyBytesWriter_Dealloc(&writer);
5452 return NULL;
5453 }
5454 return _PyBytesWriter_Finish(&writer, end);
5455}
5456
5457static int
5458unicode_fill_utf8(PyObject *unicode)
5459{
5460 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5461 assert(!PyUnicode_IS_ASCII(unicode));
5462
5463 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005464 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005465 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5466
5467 _PyBytesWriter writer;
5468 char *end;
5469
5470 switch (kind) {
5471 default:
5472 Py_UNREACHABLE();
5473 case PyUnicode_1BYTE_KIND:
5474 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5475 _Py_ERROR_STRICT, NULL);
5476 break;
5477 case PyUnicode_2BYTE_KIND:
5478 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5479 _Py_ERROR_STRICT, NULL);
5480 break;
5481 case PyUnicode_4BYTE_KIND:
5482 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5483 _Py_ERROR_STRICT, NULL);
5484 break;
5485 }
5486 if (end == NULL) {
5487 _PyBytesWriter_Dealloc(&writer);
5488 return -1;
5489 }
5490
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005491 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005492 PyBytes_AS_STRING(writer.buffer);
5493 Py_ssize_t len = end - start;
5494
5495 char *cache = PyObject_MALLOC(len + 1);
5496 if (cache == NULL) {
5497 _PyBytesWriter_Dealloc(&writer);
5498 PyErr_NoMemory();
5499 return -1;
5500 }
5501 _PyUnicode_UTF8(unicode) = cache;
5502 _PyUnicode_UTF8_LENGTH(unicode) = len;
5503 memcpy(cache, start, len);
5504 cache[len] = '\0';
5505 _PyBytesWriter_Dealloc(&writer);
5506 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507}
5508
Alexander Belopolsky40018472011-02-26 01:02:56 +00005509PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005510_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5511{
5512 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5513}
5514
5515
5516PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005517PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5518 Py_ssize_t size,
5519 const char *errors)
5520{
5521 PyObject *v, *unicode;
5522
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005523 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005524 if (unicode == NULL)
5525 return NULL;
5526 v = _PyUnicode_AsUTF8String(unicode, errors);
5527 Py_DECREF(unicode);
5528 return v;
5529}
5530
5531PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005532PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005534 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535}
5536
Walter Dörwald41980ca2007-08-16 21:55:45 +00005537/* --- UTF-32 Codec ------------------------------------------------------- */
5538
5539PyObject *
5540PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 Py_ssize_t size,
5542 const char *errors,
5543 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005544{
5545 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5546}
5547
5548PyObject *
5549PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 Py_ssize_t size,
5551 const char *errors,
5552 int *byteorder,
5553 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005554{
5555 const char *starts = s;
5556 Py_ssize_t startinpos;
5557 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005558 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005559 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005560 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005561 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005562 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005563 PyObject *errorHandler = NULL;
5564 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005565
Andy Lestere6be9b52020-02-11 20:28:35 -06005566 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005567 e = q + size;
5568
5569 if (byteorder)
5570 bo = *byteorder;
5571
5572 /* Check for BOM marks (U+FEFF) in the input and adjust current
5573 byte order setting accordingly. In native mode, the leading BOM
5574 mark is skipped, in all other modes, it is copied to the output
5575 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005576 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005577 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005578 if (bom == 0x0000FEFF) {
5579 bo = -1;
5580 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005582 else if (bom == 0xFFFE0000) {
5583 bo = 1;
5584 q += 4;
5585 }
5586 if (byteorder)
5587 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005588 }
5589
Victor Stinnere64322e2012-10-30 23:12:47 +01005590 if (q == e) {
5591 if (consumed)
5592 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005593 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005594 }
5595
Victor Stinnere64322e2012-10-30 23:12:47 +01005596#ifdef WORDS_BIGENDIAN
5597 le = bo < 0;
5598#else
5599 le = bo <= 0;
5600#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005601 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005602
Victor Stinner8f674cc2013-04-17 23:02:17 +02005603 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005604 writer.min_length = (e - q + 3) / 4;
5605 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005606 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005607
Victor Stinnere64322e2012-10-30 23:12:47 +01005608 while (1) {
5609 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005610 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005611
Victor Stinnere64322e2012-10-30 23:12:47 +01005612 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005613 enum PyUnicode_Kind kind = writer.kind;
5614 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005615 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005616 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005617 if (le) {
5618 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005619 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005620 if (ch > maxch)
5621 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005622 if (kind != PyUnicode_1BYTE_KIND &&
5623 Py_UNICODE_IS_SURROGATE(ch))
5624 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005625 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005626 q += 4;
5627 } while (q <= last);
5628 }
5629 else {
5630 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005631 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005632 if (ch > maxch)
5633 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005634 if (kind != PyUnicode_1BYTE_KIND &&
5635 Py_UNICODE_IS_SURROGATE(ch))
5636 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005638 q += 4;
5639 } while (q <= last);
5640 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005642 }
5643
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005644 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005645 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005646 startinpos = ((const char *)q) - starts;
5647 endinpos = startinpos + 4;
5648 }
5649 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005650 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005652 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005654 startinpos = ((const char *)q) - starts;
5655 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005657 else {
5658 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005659 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005660 goto onError;
5661 q += 4;
5662 continue;
5663 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005664 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005665 startinpos = ((const char *)q) - starts;
5666 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005668
5669 /* The remaining input chars are ignored if the callback
5670 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005671 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005673 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005675 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005677 }
5678
Walter Dörwald41980ca2007-08-16 21:55:45 +00005679 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005681
Walter Dörwald41980ca2007-08-16 21:55:45 +00005682 Py_XDECREF(errorHandler);
5683 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005684 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005685
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005688 Py_XDECREF(errorHandler);
5689 Py_XDECREF(exc);
5690 return NULL;
5691}
5692
5693PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005694_PyUnicode_EncodeUTF32(PyObject *str,
5695 const char *errors,
5696 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005697{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005698 enum PyUnicode_Kind kind;
5699 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005700 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005701 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005702 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005703#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005704 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005705#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005706 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005707#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005708 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005709 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005710 PyObject *errorHandler = NULL;
5711 PyObject *exc = NULL;
5712 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005713
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005714 if (!PyUnicode_Check(str)) {
5715 PyErr_BadArgument();
5716 return NULL;
5717 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005718 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005719 return NULL;
5720 kind = PyUnicode_KIND(str);
5721 data = PyUnicode_DATA(str);
5722 len = PyUnicode_GET_LENGTH(str);
5723
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005724 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005725 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005726 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005727 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005728 if (v == NULL)
5729 return NULL;
5730
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005731 /* output buffer is 4-bytes aligned */
5732 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005733 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005734 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005735 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005737 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005738
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005739 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005740 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005741 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005742 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005743 else
5744 encoding = "utf-32";
5745
5746 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005747 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5748 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005749 }
5750
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005751 pos = 0;
5752 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005754
5755 if (kind == PyUnicode_2BYTE_KIND) {
5756 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5757 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005759 else {
5760 assert(kind == PyUnicode_4BYTE_KIND);
5761 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5762 &out, native_ordering);
5763 }
5764 if (pos == len)
5765 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005766
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 rep = unicode_encode_call_errorhandler(
5768 errors, &errorHandler,
5769 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005770 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005771 if (!rep)
5772 goto error;
5773
5774 if (PyBytes_Check(rep)) {
5775 repsize = PyBytes_GET_SIZE(rep);
5776 if (repsize & 3) {
5777 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005778 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005779 "surrogates not allowed");
5780 goto error;
5781 }
5782 moreunits = repsize / 4;
5783 }
5784 else {
5785 assert(PyUnicode_Check(rep));
5786 if (PyUnicode_READY(rep) < 0)
5787 goto error;
5788 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5789 if (!PyUnicode_IS_ASCII(rep)) {
5790 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005791 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005792 "surrogates not allowed");
5793 goto error;
5794 }
5795 }
5796
5797 /* four bytes are reserved for each surrogate */
5798 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005799 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005800 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005801 /* integer overflow */
5802 PyErr_NoMemory();
5803 goto error;
5804 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005805 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005806 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005807 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005808 }
5809
5810 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005811 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005812 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005813 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005814 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005815 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5816 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005817 }
5818
5819 Py_CLEAR(rep);
5820 }
5821
5822 /* Cut back to size actually needed. This is necessary for, for example,
5823 encoding of a string containing isolated surrogates and the 'ignore'
5824 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005825 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005826 if (nsize != PyBytes_GET_SIZE(v))
5827 _PyBytes_Resize(&v, nsize);
5828 Py_XDECREF(errorHandler);
5829 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005830 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005831 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005832 error:
5833 Py_XDECREF(rep);
5834 Py_XDECREF(errorHandler);
5835 Py_XDECREF(exc);
5836 Py_XDECREF(v);
5837 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005838}
5839
Alexander Belopolsky40018472011-02-26 01:02:56 +00005840PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005841PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5842 Py_ssize_t size,
5843 const char *errors,
5844 int byteorder)
5845{
5846 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005847 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005848 if (tmp == NULL)
5849 return NULL;
5850 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5851 Py_DECREF(tmp);
5852 return result;
5853}
5854
5855PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005856PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005857{
Victor Stinnerb960b342011-11-20 19:12:52 +01005858 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005859}
5860
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861/* --- UTF-16 Codec ------------------------------------------------------- */
5862
Tim Peters772747b2001-08-09 22:21:55 +00005863PyObject *
5864PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 Py_ssize_t size,
5866 const char *errors,
5867 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868{
Walter Dörwald69652032004-09-07 20:24:22 +00005869 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5870}
5871
5872PyObject *
5873PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 Py_ssize_t size,
5875 const char *errors,
5876 int *byteorder,
5877 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005880 Py_ssize_t startinpos;
5881 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005882 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005883 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005884 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005885 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005886 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887 PyObject *errorHandler = NULL;
5888 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005889 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
Andy Lestere6be9b52020-02-11 20:28:35 -06005891 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005892 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893
5894 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005895 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005897 /* Check for BOM marks (U+FEFF) in the input and adjust current
5898 byte order setting accordingly. In native mode, the leading BOM
5899 mark is skipped, in all other modes, it is copied to the output
5900 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005901 if (bo == 0 && size >= 2) {
5902 const Py_UCS4 bom = (q[1] << 8) | q[0];
5903 if (bom == 0xFEFF) {
5904 q += 2;
5905 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005907 else if (bom == 0xFFFE) {
5908 q += 2;
5909 bo = 1;
5910 }
5911 if (byteorder)
5912 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
Antoine Pitrou63065d72012-05-15 23:48:04 +02005915 if (q == e) {
5916 if (consumed)
5917 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005918 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005919 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005920
Christian Heimes743e0cd2012-10-17 23:52:17 +02005921#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005922 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005923 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005924#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005925 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005926 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005927#endif
Tim Peters772747b2001-08-09 22:21:55 +00005928
Antoine Pitrou63065d72012-05-15 23:48:04 +02005929 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005930 character count normally. Error handler will take care of
5931 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005932 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005933 writer.min_length = (e - q + 1) / 2;
5934 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005935 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005936
Antoine Pitrou63065d72012-05-15 23:48:04 +02005937 while (1) {
5938 Py_UCS4 ch = 0;
5939 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005940 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005941 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005942 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005943 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005944 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005945 native_ordering);
5946 else
5947 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005948 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005949 native_ordering);
5950 } else if (kind == PyUnicode_2BYTE_KIND) {
5951 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005952 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005953 native_ordering);
5954 } else {
5955 assert(kind == PyUnicode_4BYTE_KIND);
5956 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005957 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005958 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005959 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961
Antoine Pitrou63065d72012-05-15 23:48:04 +02005962 switch (ch)
5963 {
5964 case 0:
5965 /* remaining byte at the end? (size should be even) */
5966 if (q == e || consumed)
5967 goto End;
5968 errmsg = "truncated data";
5969 startinpos = ((const char *)q) - starts;
5970 endinpos = ((const char *)e) - starts;
5971 break;
5972 /* The remaining input chars are ignored if the callback
5973 chooses to skip the input */
5974 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005975 q -= 2;
5976 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005977 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005978 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005979 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005980 endinpos = ((const char *)e) - starts;
5981 break;
5982 case 2:
5983 errmsg = "illegal encoding";
5984 startinpos = ((const char *)q) - 2 - starts;
5985 endinpos = startinpos + 2;
5986 break;
5987 case 3:
5988 errmsg = "illegal UTF-16 surrogate";
5989 startinpos = ((const char *)q) - 4 - starts;
5990 endinpos = startinpos + 2;
5991 break;
5992 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005993 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005994 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 continue;
5996 }
5997
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005998 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005999 errors,
6000 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006001 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006002 &starts,
6003 (const char **)&e,
6004 &startinpos,
6005 &endinpos,
6006 &exc,
6007 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006008 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 }
6011
Antoine Pitrou63065d72012-05-15 23:48:04 +02006012End:
Walter Dörwald69652032004-09-07 20:24:22 +00006013 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006015
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 Py_XDECREF(errorHandler);
6017 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006018 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006021 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 Py_XDECREF(errorHandler);
6023 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 return NULL;
6025}
6026
Tim Peters772747b2001-08-09 22:21:55 +00006027PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006028_PyUnicode_EncodeUTF16(PyObject *str,
6029 const char *errors,
6030 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006032 enum PyUnicode_Kind kind;
6033 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006035 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006036 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006037 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006038#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006039 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006040#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006041 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006042#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006043 const char *encoding;
6044 Py_ssize_t nsize, pos;
6045 PyObject *errorHandler = NULL;
6046 PyObject *exc = NULL;
6047 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006048
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006049 if (!PyUnicode_Check(str)) {
6050 PyErr_BadArgument();
6051 return NULL;
6052 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006053 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006054 return NULL;
6055 kind = PyUnicode_KIND(str);
6056 data = PyUnicode_DATA(str);
6057 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006058
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006059 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006060 if (kind == PyUnicode_4BYTE_KIND) {
6061 const Py_UCS4 *in = (const Py_UCS4 *)data;
6062 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006063 while (in < end) {
6064 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006065 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006066 }
6067 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006068 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006069 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006071 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006072 nsize = len + pairs + (byteorder == 0);
6073 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006074 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006076 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006078 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006079 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006080 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006081 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006082 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006083 }
6084 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006085 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006086 }
Tim Peters772747b2001-08-09 22:21:55 +00006087
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006088 if (kind == PyUnicode_1BYTE_KIND) {
6089 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6090 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006091 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006092
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006093 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006094 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006095 }
6096 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006097 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006098 }
6099 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006100 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006101 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006102
6103 pos = 0;
6104 while (pos < len) {
6105 Py_ssize_t repsize, moreunits;
6106
6107 if (kind == PyUnicode_2BYTE_KIND) {
6108 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6109 &out, native_ordering);
6110 }
6111 else {
6112 assert(kind == PyUnicode_4BYTE_KIND);
6113 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6114 &out, native_ordering);
6115 }
6116 if (pos == len)
6117 break;
6118
6119 rep = unicode_encode_call_errorhandler(
6120 errors, &errorHandler,
6121 encoding, "surrogates not allowed",
6122 str, &exc, pos, pos + 1, &pos);
6123 if (!rep)
6124 goto error;
6125
6126 if (PyBytes_Check(rep)) {
6127 repsize = PyBytes_GET_SIZE(rep);
6128 if (repsize & 1) {
6129 raise_encode_exception(&exc, encoding,
6130 str, pos - 1, pos,
6131 "surrogates not allowed");
6132 goto error;
6133 }
6134 moreunits = repsize / 2;
6135 }
6136 else {
6137 assert(PyUnicode_Check(rep));
6138 if (PyUnicode_READY(rep) < 0)
6139 goto error;
6140 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6141 if (!PyUnicode_IS_ASCII(rep)) {
6142 raise_encode_exception(&exc, encoding,
6143 str, pos - 1, pos,
6144 "surrogates not allowed");
6145 goto error;
6146 }
6147 }
6148
6149 /* two bytes are reserved for each surrogate */
6150 if (moreunits > 1) {
6151 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006152 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006153 /* integer overflow */
6154 PyErr_NoMemory();
6155 goto error;
6156 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006157 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006158 goto error;
6159 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6160 }
6161
6162 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006163 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006164 out += moreunits;
6165 } else /* rep is unicode */ {
6166 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6167 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6168 &out, native_ordering);
6169 }
6170
6171 Py_CLEAR(rep);
6172 }
6173
6174 /* Cut back to size actually needed. This is necessary for, for example,
6175 encoding of a string containing isolated surrogates and the 'ignore' handler
6176 is used. */
6177 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6178 if (nsize != PyBytes_GET_SIZE(v))
6179 _PyBytes_Resize(&v, nsize);
6180 Py_XDECREF(errorHandler);
6181 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006182 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006183 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006184 error:
6185 Py_XDECREF(rep);
6186 Py_XDECREF(errorHandler);
6187 Py_XDECREF(exc);
6188 Py_XDECREF(v);
6189 return NULL;
6190#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191}
6192
Alexander Belopolsky40018472011-02-26 01:02:56 +00006193PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006194PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6195 Py_ssize_t size,
6196 const char *errors,
6197 int byteorder)
6198{
6199 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006200 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201 if (tmp == NULL)
6202 return NULL;
6203 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6204 Py_DECREF(tmp);
6205 return result;
6206}
6207
6208PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006209PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212}
6213
/* --- Unicode Escape Codec ----------------------------------------------- */
6215
Fredrik Lundh06d12682001-01-24 07:59:11 +00006216static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006217
Alexander Belopolsky40018472011-02-26 01:02:56 +00006218PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006219_PyUnicode_DecodeUnicodeEscape(const char *s,
6220 Py_ssize_t size,
6221 const char *errors,
6222 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006224 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006225 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006227 PyObject *errorHandler = NULL;
6228 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006229
Eric V. Smith42454af2016-10-31 09:22:08 -04006230 // so we can remember if we've seen an invalid escape char or not
6231 *first_invalid_escape = NULL;
6232
Victor Stinner62ec3312016-09-06 17:04:34 -07006233 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006234 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 }
6236 /* Escaped strings will always be longer than the resulting
6237 Unicode string, so we start with size here and then reduce the
6238 length after conversion to the true value.
6239 (but if the error callback returns a long replacement string
6240 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006241 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 writer.min_length = size;
6243 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6244 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006245 }
6246
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 end = s + size;
6248 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 unsigned char c = (unsigned char) *s++;
6250 Py_UCS4 ch;
6251 int count;
6252 Py_ssize_t startinpos;
6253 Py_ssize_t endinpos;
6254 const char *message;
6255
6256#define WRITE_ASCII_CHAR(ch) \
6257 do { \
6258 assert(ch <= 127); \
6259 assert(writer.pos < writer.size); \
6260 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6261 } while(0)
6262
6263#define WRITE_CHAR(ch) \
6264 do { \
6265 if (ch <= writer.maxchar) { \
6266 assert(writer.pos < writer.size); \
6267 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6268 } \
6269 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6270 goto onError; \
6271 } \
6272 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273
6274 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006275 if (c != '\\') {
6276 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 continue;
6278 }
6279
Victor Stinner62ec3312016-09-06 17:04:34 -07006280 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 if (s >= end) {
6283 message = "\\ at end of string";
6284 goto error;
6285 }
6286 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006287
Victor Stinner62ec3312016-09-06 17:04:34 -07006288 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006289 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006292 case '\n': continue;
6293 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6294 case '\'': WRITE_ASCII_CHAR('\''); continue;
6295 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6296 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006297 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6299 case 't': WRITE_ASCII_CHAR('\t'); continue;
6300 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6301 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006302 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006304 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 case '0': case '1': case '2': case '3':
6309 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006310 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006311 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006312 ch = (ch<<3) + *s++ - '0';
6313 if (s < end && '0' <= *s && *s <= '7') {
6314 ch = (ch<<3) + *s++ - '0';
6315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 WRITE_CHAR(ch);
6318 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 /* hex escapes */
6321 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006323 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006324 message = "truncated \\xXX escape";
6325 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006329 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006330 message = "truncated \\uXXXX escape";
6331 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006334 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006336 message = "truncated \\UXXXXXXXX escape";
6337 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006338 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006339 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 ch <<= 4;
6341 if (c >= '0' && c <= '9') {
6342 ch += c - '0';
6343 }
6344 else if (c >= 'a' && c <= 'f') {
6345 ch += c - ('a' - 10);
6346 }
6347 else if (c >= 'A' && c <= 'F') {
6348 ch += c - ('A' - 10);
6349 }
6350 else {
6351 break;
6352 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006353 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006354 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006355 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006356 }
6357
6358 /* when we get here, ch is a 32-bit unicode character */
6359 if (ch > MAX_UNICODE) {
6360 message = "illegal Unicode character";
6361 goto error;
6362 }
6363
6364 WRITE_CHAR(ch);
6365 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006366
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006368 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006369 if (ucnhash_CAPI == NULL) {
6370 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006371 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6372 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006373 if (ucnhash_CAPI == NULL) {
6374 PyErr_SetString(
6375 PyExc_UnicodeError,
6376 "\\N escapes not supported (can't load unicodedata module)"
6377 );
6378 goto onError;
6379 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006380 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006381
6382 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006383 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006384 const char *start = ++s;
6385 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006386 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006387 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006388 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006389 namelen = s - start;
6390 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006391 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006392 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006393 ch = 0xffffffff; /* in case 'getcode' messes up */
6394 if (namelen <= INT_MAX &&
6395 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6396 &ch, 0)) {
6397 assert(ch <= MAX_UNICODE);
6398 WRITE_CHAR(ch);
6399 continue;
6400 }
6401 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006402 }
6403 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006404 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006405
6406 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006407 if (*first_invalid_escape == NULL) {
6408 *first_invalid_escape = s-1; /* Back up one char, since we've
6409 already incremented s. */
6410 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 WRITE_ASCII_CHAR('\\');
6412 WRITE_CHAR(c);
6413 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006415
6416 error:
6417 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006419 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006420 errors, &errorHandler,
6421 "unicodeescape", message,
6422 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006424 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006426 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006427
6428#undef WRITE_ASCII_CHAR
6429#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006431
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006432 Py_XDECREF(errorHandler);
6433 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006434 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006435
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006437 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 Py_XDECREF(errorHandler);
6439 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 return NULL;
6441}
6442
Eric V. Smith42454af2016-10-31 09:22:08 -04006443PyObject *
6444PyUnicode_DecodeUnicodeEscape(const char *s,
6445 Py_ssize_t size,
6446 const char *errors)
6447{
6448 const char *first_invalid_escape;
6449 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6450 &first_invalid_escape);
6451 if (result == NULL)
6452 return NULL;
6453 if (first_invalid_escape != NULL) {
6454 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6455 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006456 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006457 Py_DECREF(result);
6458 return NULL;
6459 }
6460 }
6461 return result;
6462}
6463
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006464/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465
Alexander Belopolsky40018472011-02-26 01:02:56 +00006466PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006467PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006469 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006470 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006473 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006474 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475
Ezio Melottie7f90372012-10-05 03:33:31 +03006476 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006477 escape.
6478
Ezio Melottie7f90372012-10-05 03:33:31 +03006479 For UCS1 strings it's '\xxx', 4 bytes per source character.
6480 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6481 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006482 */
6483
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006484 if (!PyUnicode_Check(unicode)) {
6485 PyErr_BadArgument();
6486 return NULL;
6487 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006488 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 }
Victor Stinner358af132015-10-12 22:36:57 +02006491
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006493 if (len == 0) {
6494 return PyBytes_FromStringAndSize(NULL, 0);
6495 }
6496
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006497 kind = PyUnicode_KIND(unicode);
6498 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6500 bytes, and 1 byte characters 4. */
6501 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006502 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006503 return PyErr_NoMemory();
6504 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006505 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006506 if (repr == NULL) {
6507 return NULL;
6508 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006509
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006511 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006512 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006513
Victor Stinner62ec3312016-09-06 17:04:34 -07006514 /* U+0000-U+00ff range */
6515 if (ch < 0x100) {
6516 if (ch >= ' ' && ch < 127) {
6517 if (ch != '\\') {
6518 /* Copy printable US ASCII as-is */
6519 *p++ = (char) ch;
6520 }
6521 /* Escape backslashes */
6522 else {
6523 *p++ = '\\';
6524 *p++ = '\\';
6525 }
6526 }
Victor Stinner358af132015-10-12 22:36:57 +02006527
Victor Stinner62ec3312016-09-06 17:04:34 -07006528 /* Map special whitespace to '\t', \n', '\r' */
6529 else if (ch == '\t') {
6530 *p++ = '\\';
6531 *p++ = 't';
6532 }
6533 else if (ch == '\n') {
6534 *p++ = '\\';
6535 *p++ = 'n';
6536 }
6537 else if (ch == '\r') {
6538 *p++ = '\\';
6539 *p++ = 'r';
6540 }
6541
6542 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6543 else {
6544 *p++ = '\\';
6545 *p++ = 'x';
6546 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6547 *p++ = Py_hexdigits[ch & 0x000F];
6548 }
Tim Petersced69f82003-09-16 20:30:58 +00006549 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006550 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006551 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 *p++ = '\\';
6553 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006554 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6555 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6556 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6557 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006559 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6560 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006561
Victor Stinner62ec3312016-09-06 17:04:34 -07006562 /* Make sure that the first two digits are zero */
6563 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006564 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006565 *p++ = 'U';
6566 *p++ = '0';
6567 *p++ = '0';
6568 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6569 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6570 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6571 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6572 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6573 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576
Victor Stinner62ec3312016-09-06 17:04:34 -07006577 assert(p - PyBytes_AS_STRING(repr) > 0);
6578 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6579 return NULL;
6580 }
6581 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582}
6583
Alexander Belopolsky40018472011-02-26 01:02:56 +00006584PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006585PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6586 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006588 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006589 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006590 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006592 }
6593
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006594 result = PyUnicode_AsUnicodeEscapeString(tmp);
6595 Py_DECREF(tmp);
6596 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597}
6598
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6600
Alexander Belopolsky40018472011-02-26 01:02:56 +00006601PyObject *
6602PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006603 Py_ssize_t size,
6604 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006607 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006609 PyObject *errorHandler = NULL;
6610 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006611
Victor Stinner62ec3312016-09-06 17:04:34 -07006612 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006613 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006614 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006615
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 /* Escaped strings will always be longer than the resulting
6617 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006618 length after conversion to the true value. (But decoding error
6619 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006620 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006621 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006622 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6623 goto onError;
6624 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006625
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 end = s + size;
6627 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006628 unsigned char c = (unsigned char) *s++;
6629 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006630 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006631 Py_ssize_t startinpos;
6632 Py_ssize_t endinpos;
6633 const char *message;
6634
6635#define WRITE_CHAR(ch) \
6636 do { \
6637 if (ch <= writer.maxchar) { \
6638 assert(writer.pos < writer.size); \
6639 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6640 } \
6641 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6642 goto onError; \
6643 } \
6644 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006647 if (c != '\\' || s >= end) {
6648 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006650 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006651
Victor Stinner62ec3312016-09-06 17:04:34 -07006652 c = (unsigned char) *s++;
6653 if (c == 'u') {
6654 count = 4;
6655 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006657 else if (c == 'U') {
6658 count = 8;
6659 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006660 }
6661 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006662 assert(writer.pos < writer.size);
6663 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6664 WRITE_CHAR(c);
6665 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006666 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006667 startinpos = s - starts - 2;
6668
6669 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6670 for (ch = 0; count && s < end; ++s, --count) {
6671 c = (unsigned char)*s;
6672 ch <<= 4;
6673 if (c >= '0' && c <= '9') {
6674 ch += c - '0';
6675 }
6676 else if (c >= 'a' && c <= 'f') {
6677 ch += c - ('a' - 10);
6678 }
6679 else if (c >= 'A' && c <= 'F') {
6680 ch += c - ('A' - 10);
6681 }
6682 else {
6683 break;
6684 }
6685 }
6686 if (!count) {
6687 if (ch <= MAX_UNICODE) {
6688 WRITE_CHAR(ch);
6689 continue;
6690 }
6691 message = "\\Uxxxxxxxx out of range";
6692 }
6693
6694 endinpos = s-starts;
6695 writer.min_length = end - s + writer.pos;
6696 if (unicode_decode_call_errorhandler_writer(
6697 errors, &errorHandler,
6698 "rawunicodeescape", message,
6699 &starts, &end, &startinpos, &endinpos, &exc, &s,
6700 &writer)) {
6701 goto onError;
6702 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006703 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006704
6705#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 Py_XDECREF(errorHandler);
6708 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006709 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006710
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006712 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 Py_XDECREF(errorHandler);
6714 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006716
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006719
Alexander Belopolsky40018472011-02-26 01:02:56 +00006720PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006721PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722{
Victor Stinner62ec3312016-09-06 17:04:34 -07006723 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006725 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006726 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006727 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006728 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006730 if (!PyUnicode_Check(unicode)) {
6731 PyErr_BadArgument();
6732 return NULL;
6733 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006734 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006735 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006736 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006737 kind = PyUnicode_KIND(unicode);
6738 data = PyUnicode_DATA(unicode);
6739 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006740 if (kind == PyUnicode_1BYTE_KIND) {
6741 return PyBytes_FromStringAndSize(data, len);
6742 }
Victor Stinner0e368262011-11-10 20:12:49 +01006743
Victor Stinner62ec3312016-09-06 17:04:34 -07006744 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6745 bytes, and 1 byte characters 4. */
6746 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006747
Victor Stinner62ec3312016-09-06 17:04:34 -07006748 if (len > PY_SSIZE_T_MAX / expandsize) {
6749 return PyErr_NoMemory();
6750 }
6751 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6752 if (repr == NULL) {
6753 return NULL;
6754 }
6755 if (len == 0) {
6756 return repr;
6757 }
6758
6759 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006760 for (pos = 0; pos < len; pos++) {
6761 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006762
Victor Stinner62ec3312016-09-06 17:04:34 -07006763 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6764 if (ch < 0x100) {
6765 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006766 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006767 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006768 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 *p++ = '\\';
6770 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006771 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6772 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6773 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6774 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006776 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6777 else {
6778 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6779 *p++ = '\\';
6780 *p++ = 'U';
6781 *p++ = '0';
6782 *p++ = '0';
6783 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6784 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6785 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6786 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6787 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6788 *p++ = Py_hexdigits[ch & 15];
6789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006791
Victor Stinner62ec3312016-09-06 17:04:34 -07006792 assert(p > PyBytes_AS_STRING(repr));
6793 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6794 return NULL;
6795 }
6796 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797}
6798
Alexander Belopolsky40018472011-02-26 01:02:56 +00006799PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006800PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6801 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006803 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006804 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006805 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006806 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006807 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6808 Py_DECREF(tmp);
6809 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810}
6811
/* --- Latin-1 Codec ------------------------------------------------------ */
6813
Alexander Belopolsky40018472011-02-26 01:02:56 +00006814PyObject *
6815PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006816 Py_ssize_t size,
6817 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006820 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821}
6822
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006823/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006824static void
6825make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006826 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006827 PyObject *unicode,
6828 Py_ssize_t startpos, Py_ssize_t endpos,
6829 const char *reason)
6830{
6831 if (*exceptionObject == NULL) {
6832 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006833 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006834 encoding, unicode, startpos, endpos, reason);
6835 }
6836 else {
6837 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6838 goto onError;
6839 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6840 goto onError;
6841 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6842 goto onError;
6843 return;
6844 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006845 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006846 }
6847}
6848
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006850static void
6851raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006852 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006853 PyObject *unicode,
6854 Py_ssize_t startpos, Py_ssize_t endpos,
6855 const char *reason)
6856{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006857 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006858 encoding, unicode, startpos, endpos, reason);
6859 if (*exceptionObject != NULL)
6860 PyCodec_StrictErrors(*exceptionObject);
6861}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862
6863/* error handling callback helper:
6864 build arguments, call the callback and check the arguments,
6865 put the result into newpos and return the replacement string, which
6866 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006867static PyObject *
6868unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006869 PyObject **errorHandler,
6870 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006871 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006872 Py_ssize_t startpos, Py_ssize_t endpos,
6873 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006875 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006876 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006877 PyObject *restuple;
6878 PyObject *resunicode;
6879
6880 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006882 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006884 }
6885
Benjamin Petersonbac79492012-01-14 13:34:47 -05006886 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887 return NULL;
6888 len = PyUnicode_GET_LENGTH(unicode);
6889
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006890 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006891 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006892 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006894
Petr Viktorinffd97532020-02-11 17:46:57 +01006895 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006899 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 Py_DECREF(restuple);
6901 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006902 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006903 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 &resunicode, newpos)) {
6905 Py_DECREF(restuple);
6906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006907 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006908 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6909 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6910 Py_DECREF(restuple);
6911 return NULL;
6912 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 *newpos = len + *newpos;
6915 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006916 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 Py_DECREF(restuple);
6918 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006919 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920 Py_INCREF(resunicode);
6921 Py_DECREF(restuple);
6922 return resunicode;
6923}
6924
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006926unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006927 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006928 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006930 /* input state */
6931 Py_ssize_t pos=0, size;
6932 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006933 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 /* pointer into the output */
6935 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006936 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6937 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006938 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006940 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006941 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006942 /* output object */
6943 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944
Benjamin Petersonbac79492012-01-14 13:34:47 -05006945 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006946 return NULL;
6947 size = PyUnicode_GET_LENGTH(unicode);
6948 kind = PyUnicode_KIND(unicode);
6949 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006950 /* allocate enough for a simple encoding without
6951 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006952 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006953 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006954
6955 _PyBytesWriter_Init(&writer);
6956 str = _PyBytesWriter_Alloc(&writer, size);
6957 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006958 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006960 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006961 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006962
Benjamin Peterson29060642009-01-31 22:14:21 +00006963 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006964 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006966 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006967 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006968 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006970 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006972 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006973 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006975
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006976 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006978
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006979 /* Only overallocate the buffer if it's not the last write */
6980 writer.overallocate = (collend < size);
6981
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006983 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006984 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006985
6986 switch (error_handler) {
6987 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006988 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006990
6991 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006992 memset(str, '?', collend - collstart);
6993 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006994 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006995 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006996 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 break;
Victor Stinner50149202015-09-22 00:26:54 +02006998
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006999 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007000 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007001 writer.min_size -= (collend - collstart);
7002 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007003 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007004 if (str == NULL)
7005 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007006 pos = collend;
7007 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007008
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007009 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007010 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007011 writer.min_size -= (collend - collstart);
7012 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007013 unicode, collstart, collend);
7014 if (str == NULL)
7015 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007016 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 break;
Victor Stinner50149202015-09-22 00:26:54 +02007018
Victor Stinnerc3713e92015-09-29 12:32:13 +02007019 case _Py_ERROR_SURROGATEESCAPE:
7020 for (i = collstart; i < collend; ++i) {
7021 ch = PyUnicode_READ(kind, data, i);
7022 if (ch < 0xdc80 || 0xdcff < ch) {
7023 /* Not a UTF-8b surrogate */
7024 break;
7025 }
7026 *str++ = (char)(ch - 0xdc00);
7027 ++pos;
7028 }
7029 if (i >= collend)
7030 break;
7031 collstart = pos;
7032 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007033 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007034
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007036 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7037 encoding, reason, unicode, &exc,
7038 collstart, collend, &newpos);
7039 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007041
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007042 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007043 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007044
Victor Stinner6bd525b2015-10-09 13:10:05 +02007045 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007046 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007047 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007048 PyBytes_AS_STRING(rep),
7049 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007050 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007051 else {
7052 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007053
Victor Stinner6bd525b2015-10-09 13:10:05 +02007054 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007056
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007057 if (limit == 256 ?
7058 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7059 !PyUnicode_IS_ASCII(rep))
7060 {
7061 /* Not all characters are smaller than limit */
7062 raise_encode_exception(&exc, encoding, unicode,
7063 collstart, collend, reason);
7064 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007065 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007066 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7067 str = _PyBytesWriter_WriteBytes(&writer, str,
7068 PyUnicode_DATA(rep),
7069 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007071 if (str == NULL)
7072 goto onError;
7073
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007074 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007075 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007076 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007077
7078 /* If overallocation was disabled, ensure that it was the last
7079 write. Otherwise, we missed an optimization */
7080 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007081 }
7082 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007083
Victor Stinner50149202015-09-22 00:26:54 +02007084 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007085 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007086 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007087
7088 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007089 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007090 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007091 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007092 Py_XDECREF(exc);
7093 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007094}
7095
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007096/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007097PyObject *
7098PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007099 Py_ssize_t size,
7100 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007102 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007103 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007104 if (unicode == NULL)
7105 return NULL;
7106 result = unicode_encode_ucs1(unicode, errors, 256);
7107 Py_DECREF(unicode);
7108 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109}
7110
Alexander Belopolsky40018472011-02-26 01:02:56 +00007111PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007112_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113{
7114 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 PyErr_BadArgument();
7116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007118 if (PyUnicode_READY(unicode) == -1)
7119 return NULL;
7120 /* Fast path: if it is a one-byte string, construct
7121 bytes object directly. */
7122 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7123 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7124 PyUnicode_GET_LENGTH(unicode));
7125 /* Non-Latin-1 characters present. Defer to above function to
7126 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007127 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007128}
7129
7130PyObject*
7131PyUnicode_AsLatin1String(PyObject *unicode)
7132{
7133 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134}
7135
7136/* --- 7-bit ASCII Codec -------------------------------------------------- */
7137
Alexander Belopolsky40018472011-02-26 01:02:56 +00007138PyObject *
7139PyUnicode_DecodeASCII(const char *s,
7140 Py_ssize_t size,
7141 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007143 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007144 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007145 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007146 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007147 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007148
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007150 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007151
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007153 if (size == 1 && (unsigned char)s[0] < 128)
7154 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007155
Inada Naoki770847a2019-06-24 12:30:24 +09007156 // Shortcut for simple case
7157 PyObject *u = PyUnicode_New(size, 127);
7158 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007159 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007160 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007161 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007162 if (outpos == size) {
7163 return u;
7164 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007165
Inada Naoki770847a2019-06-24 12:30:24 +09007166 _PyUnicodeWriter writer;
7167 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007168 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007169
Inada Naoki770847a2019-06-24 12:30:24 +09007170 s += outpos;
7171 int kind = writer.kind;
7172 void *data = writer.data;
7173 Py_ssize_t startinpos, endinpos;
7174
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007175 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007176 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007178 PyUnicode_WRITE(kind, data, writer.pos, c);
7179 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007181 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007183
7184 /* byte outsize range 0x00..0x7f: call the error handler */
7185
7186 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007187 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007188
7189 switch (error_handler)
7190 {
7191 case _Py_ERROR_REPLACE:
7192 case _Py_ERROR_SURROGATEESCAPE:
7193 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007194 but we may switch to UCS2 at the first write */
7195 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7196 goto onError;
7197 kind = writer.kind;
7198 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007199
7200 if (error_handler == _Py_ERROR_REPLACE)
7201 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7202 else
7203 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7204 writer.pos++;
7205 ++s;
7206 break;
7207
7208 case _Py_ERROR_IGNORE:
7209 ++s;
7210 break;
7211
7212 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 startinpos = s-starts;
7214 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007215 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007216 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 "ascii", "ordinal not in range(128)",
7218 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007219 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007221 kind = writer.kind;
7222 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007225 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007226 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007227 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007228
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007230 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007231 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007232 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 return NULL;
7234}
7235
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007236/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007237PyObject *
7238PyUnicode_EncodeASCII(const Py_UNICODE *p,
7239 Py_ssize_t size,
7240 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007242 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007243 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007244 if (unicode == NULL)
7245 return NULL;
7246 result = unicode_encode_ucs1(unicode, errors, 128);
7247 Py_DECREF(unicode);
7248 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249}
7250
Alexander Belopolsky40018472011-02-26 01:02:56 +00007251PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007252_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253{
7254 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 PyErr_BadArgument();
7256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007258 if (PyUnicode_READY(unicode) == -1)
7259 return NULL;
7260 /* Fast path: if it is an ASCII-only string, construct bytes object
7261 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007262 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007263 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7264 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007265 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007266}
7267
7268PyObject *
7269PyUnicode_AsASCIIString(PyObject *unicode)
7270{
7271 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272}
7273
Steve Dowercc16be82016-09-08 10:35:16 -07007274#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007275
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007276/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007277
/* NEED_RETRY is defined when int is narrower than size_t.
   NOTE(review): presumably so that inputs longer than INT_MAX can be
   converted in several chunks -- confirm at the use sites. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Provide the flag value when the included headers do not define it. */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
7291
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007292static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007293code_page_name(UINT code_page, PyObject **obj)
7294{
7295 *obj = NULL;
7296 if (code_page == CP_ACP)
7297 return "mbcs";
7298 if (code_page == CP_UTF7)
7299 return "CP_UTF7";
7300 if (code_page == CP_UTF8)
7301 return "CP_UTF8";
7302
7303 *obj = PyBytes_FromFormat("cp%u", code_page);
7304 if (*obj == NULL)
7305 return NULL;
7306 return PyBytes_AS_STRING(*obj);
7307}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308
Victor Stinner3a50e702011-10-18 21:21:00 +02007309static DWORD
7310decode_code_page_flags(UINT code_page)
7311{
7312 if (code_page == CP_UTF7) {
7313 /* The CP_UTF7 decoder only supports flags=0 */
7314 return 0;
7315 }
7316 else
7317 return MB_ERR_INVALID_CHARS;
7318}
7319
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007321 * Decode a byte string from a Windows code page into unicode object in strict
7322 * mode.
7323 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007324 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7325 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007327static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007328decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007329 wchar_t **buf,
7330 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007331 const char *in,
7332 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007334 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007335 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007336 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
7338 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007340 while ((outsize = MultiByteToWideChar(code_page, flags,
7341 in, insize, NULL, 0)) <= 0)
7342 {
7343 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7344 goto error;
7345 }
7346 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7347 flags = 0;
7348 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007349
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007350 /* Extend a wchar_t* buffer */
7351 Py_ssize_t n = *bufsize; /* Get the current length */
7352 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7353 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007354 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007355 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007356
7357 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007358 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7359 if (outsize <= 0)
7360 goto error;
7361 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007362
Victor Stinner3a50e702011-10-18 21:21:00 +02007363error:
7364 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7365 return -2;
7366 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007367 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368}
7369
/*
 * Decode a byte string from a code page, one character at a time, calling
 * the error handler named by `errors` for undecodable input.  The result
 * is appended to the wchar_t buffer *buf (current length *bufsize), and
 * *bufsize is updated to the new length.
 *
 * If `final` is false, an undecodable sequence at the very end of the
 * input is left unconsumed instead of being reported.
 *
 * Returns the consumed size on success, or raises an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    /* encoding may point into encoding_obj, which is released at "error:" */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* Reserve room for a surrogate pair per input byte; check first that
       n + size * 2 cannot overflow Py_ssize_t. */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character: try 1 byte first, then progressively longer
           sequences until the conversion succeeds */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                /* retry with the same insize */
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* report a single undecodable byte to the error handler */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            /* recompute out from *buf: buf is passed to the helper above */
            out = *buf + outpos;
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* reached on success as well: releases the temporaries and returns ret */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7499
Victor Stinner3a50e702011-10-18 21:21:00 +02007500static PyObject *
7501decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007502 const char *s, Py_ssize_t size,
7503 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007504{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007505 wchar_t *buf = NULL;
7506 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007507 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007508
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 if (code_page < 0) {
7510 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7511 return NULL;
7512 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007513 if (size < 0) {
7514 PyErr_BadInternalCall();
7515 return NULL;
7516 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007517
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007518 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007520
Victor Stinner76a31a62011-11-04 00:05:13 +01007521 do
7522 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007523#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007524 if (size > DECODING_CHUNK_SIZE) {
7525 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007526 final = 0;
7527 done = 0;
7528 }
7529 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007530#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007531 {
7532 chunk_size = (int)size;
7533 final = (consumed == NULL);
7534 done = 1;
7535 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007536
Victor Stinner76a31a62011-11-04 00:05:13 +01007537 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007538 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007539 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007540 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007541 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007542
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007543 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007544 s, chunk_size);
7545 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007546 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007547 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007548 errors, final);
7549 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007550
7551 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007552 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007553 return NULL;
7554 }
7555
7556 if (consumed)
7557 *consumed += converted;
7558
7559 s += converted;
7560 size -= converted;
7561 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007562
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007563 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7564 PyMem_Free(buf);
7565 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007566}
7567
Alexander Belopolsky40018472011-02-26 01:02:56 +00007568PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007569PyUnicode_DecodeCodePageStateful(int code_page,
7570 const char *s,
7571 Py_ssize_t size,
7572 const char *errors,
7573 Py_ssize_t *consumed)
7574{
7575 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7576}
7577
7578PyObject *
7579PyUnicode_DecodeMBCSStateful(const char *s,
7580 Py_ssize_t size,
7581 const char *errors,
7582 Py_ssize_t *consumed)
7583{
7584 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7585}
7586
7587PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007588PyUnicode_DecodeMBCS(const char *s,
7589 Py_ssize_t size,
7590 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007591{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007592 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7593}
7594
Victor Stinner3a50e702011-10-18 21:21:00 +02007595static DWORD
7596encode_code_page_flags(UINT code_page, const char *errors)
7597{
7598 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007599 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 }
7601 else if (code_page == CP_UTF7) {
7602 /* CP_UTF7 only supports flags=0 */
7603 return 0;
7604 }
7605 else {
7606 if (errors != NULL && strcmp(errors, "replace") == 0)
7607 return 0;
7608 else
7609 return WC_NO_BEST_FIT_CHARS;
7610 }
7611}
7612
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007613/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 * Encode a Unicode string to a Windows code page into a byte string in strict
7615 * mode.
7616 *
7617 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007618 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007619 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007620static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007621encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007622 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007623 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007624{
Victor Stinner554f3f02010-06-16 23:33:54 +00007625 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 BOOL *pusedDefaultChar = &usedDefaultChar;
7627 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007628 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007629 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 const DWORD flags = encode_code_page_flags(code_page, NULL);
7631 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007632 /* Create a substring so that we can get the UTF-16 representation
7633 of just the slice under consideration. */
7634 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007635
Martin v. Löwis3d325192011-11-04 18:23:06 +01007636 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007637
Victor Stinner3a50e702011-10-18 21:21:00 +02007638 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007639 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007641 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007642
Victor Stinner2fc507f2011-11-04 20:06:39 +01007643 substring = PyUnicode_Substring(unicode, offset, offset+len);
7644 if (substring == NULL)
7645 return -1;
7646 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7647 if (p == NULL) {
7648 Py_DECREF(substring);
7649 return -1;
7650 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007651 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007652
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007653 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007654 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007655 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 NULL, 0,
7657 NULL, pusedDefaultChar);
7658 if (outsize <= 0)
7659 goto error;
7660 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007661 if (pusedDefaultChar && *pusedDefaultChar) {
7662 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007663 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007664 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007665
Victor Stinner3a50e702011-10-18 21:21:00 +02007666 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007668 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007669 if (*outbytes == NULL) {
7670 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007672 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007674 }
7675 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007677 const Py_ssize_t n = PyBytes_Size(*outbytes);
7678 if (outsize > PY_SSIZE_T_MAX - n) {
7679 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007680 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007682 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007683 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7684 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007685 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007686 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007687 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007688 }
7689
7690 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007691 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007692 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 out, outsize,
7694 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007695 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007696 if (outsize <= 0)
7697 goto error;
7698 if (pusedDefaultChar && *pusedDefaultChar)
7699 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007700 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007701
Victor Stinner3a50e702011-10-18 21:21:00 +02007702error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007703 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007704 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7705 return -2;
7706 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007707 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007708}
7709
Victor Stinner3a50e702011-10-18 21:21:00 +02007710/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007711 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007712 * error handler.
7713 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007714 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007715 * -1 on other error.
7716 */
7717static int
7718encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007719 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007720 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007721{
Victor Stinner3a50e702011-10-18 21:21:00 +02007722 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007723 Py_ssize_t pos = unicode_offset;
7724 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007725 /* Ideally, we should get reason from FormatMessage. This is the Windows
7726 2000 English version of the message. */
7727 const char *reason = "invalid character";
7728 /* 4=maximum length of a UTF-8 sequence */
7729 char buffer[4];
7730 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7731 Py_ssize_t outsize;
7732 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007733 PyObject *errorHandler = NULL;
7734 PyObject *exc = NULL;
7735 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007736 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007738 PyObject *rep;
7739 int ret = -1;
7740
7741 assert(insize > 0);
7742
7743 encoding = code_page_name(code_page, &encoding_obj);
7744 if (encoding == NULL)
7745 return -1;
7746
7747 if (errors == NULL || strcmp(errors, "strict") == 0) {
7748 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7749 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007750 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007751 if (exc != NULL) {
7752 PyCodec_StrictErrors(exc);
7753 Py_DECREF(exc);
7754 }
7755 Py_XDECREF(encoding_obj);
7756 return -1;
7757 }
7758
7759 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7760 pusedDefaultChar = &usedDefaultChar;
7761 else
7762 pusedDefaultChar = NULL;
7763
7764 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7765 PyErr_NoMemory();
7766 goto error;
7767 }
7768 outsize = insize * Py_ARRAY_LENGTH(buffer);
7769
7770 if (*outbytes == NULL) {
7771 /* Create string object */
7772 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7773 if (*outbytes == NULL)
7774 goto error;
7775 out = PyBytes_AS_STRING(*outbytes);
7776 }
7777 else {
7778 /* Extend string object */
7779 Py_ssize_t n = PyBytes_Size(*outbytes);
7780 if (n > PY_SSIZE_T_MAX - outsize) {
7781 PyErr_NoMemory();
7782 goto error;
7783 }
7784 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7785 goto error;
7786 out = PyBytes_AS_STRING(*outbytes) + n;
7787 }
7788
7789 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007790 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007791 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007792 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7793 wchar_t chars[2];
7794 int charsize;
7795 if (ch < 0x10000) {
7796 chars[0] = (wchar_t)ch;
7797 charsize = 1;
7798 }
7799 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007800 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7801 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007802 charsize = 2;
7803 }
7804
Victor Stinner3a50e702011-10-18 21:21:00 +02007805 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007806 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007807 buffer, Py_ARRAY_LENGTH(buffer),
7808 NULL, pusedDefaultChar);
7809 if (outsize > 0) {
7810 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7811 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007812 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007813 memcpy(out, buffer, outsize);
7814 out += outsize;
7815 continue;
7816 }
7817 }
7818 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7819 PyErr_SetFromWindowsErr(0);
7820 goto error;
7821 }
7822
Victor Stinner3a50e702011-10-18 21:21:00 +02007823 rep = unicode_encode_call_errorhandler(
7824 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007825 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007826 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007827 if (rep == NULL)
7828 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007829 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007830
7831 if (PyBytes_Check(rep)) {
7832 outsize = PyBytes_GET_SIZE(rep);
7833 if (outsize != 1) {
7834 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7835 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7836 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7837 Py_DECREF(rep);
7838 goto error;
7839 }
7840 out = PyBytes_AS_STRING(*outbytes) + offset;
7841 }
7842 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7843 out += outsize;
7844 }
7845 else {
7846 Py_ssize_t i;
7847 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007848 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007849
Benjamin Petersonbac79492012-01-14 13:34:47 -05007850 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007851 Py_DECREF(rep);
7852 goto error;
7853 }
7854
7855 outsize = PyUnicode_GET_LENGTH(rep);
7856 if (outsize != 1) {
7857 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7858 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7859 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7860 Py_DECREF(rep);
7861 goto error;
7862 }
7863 out = PyBytes_AS_STRING(*outbytes) + offset;
7864 }
7865 kind = PyUnicode_KIND(rep);
7866 data = PyUnicode_DATA(rep);
7867 for (i=0; i < outsize; i++) {
7868 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7869 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007870 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007871 encoding, unicode,
7872 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007873 "unable to encode error handler result to ASCII");
7874 Py_DECREF(rep);
7875 goto error;
7876 }
7877 *out = (unsigned char)ch;
7878 out++;
7879 }
7880 }
7881 Py_DECREF(rep);
7882 }
7883 /* write a NUL byte */
7884 *out = 0;
7885 outsize = out - PyBytes_AS_STRING(*outbytes);
7886 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7887 if (_PyBytes_Resize(outbytes, outsize) < 0)
7888 goto error;
7889 ret = 0;
7890
7891error:
7892 Py_XDECREF(encoding_obj);
7893 Py_XDECREF(errorHandler);
7894 Py_XDECREF(exc);
7895 return ret;
7896}
7897
Victor Stinner3a50e702011-10-18 21:21:00 +02007898static PyObject *
7899encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007900 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007901 const char *errors)
7902{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007903 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007904 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007905 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007906 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007907
Victor Stinner29dacf22015-01-26 16:41:32 +01007908 if (!PyUnicode_Check(unicode)) {
7909 PyErr_BadArgument();
7910 return NULL;
7911 }
7912
Benjamin Petersonbac79492012-01-14 13:34:47 -05007913 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007914 return NULL;
7915 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007916
Victor Stinner3a50e702011-10-18 21:21:00 +02007917 if (code_page < 0) {
7918 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7919 return NULL;
7920 }
7921
Martin v. Löwis3d325192011-11-04 18:23:06 +01007922 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007923 return PyBytes_FromStringAndSize(NULL, 0);
7924
Victor Stinner7581cef2011-11-03 22:32:33 +01007925 offset = 0;
7926 do
7927 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007928#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007929 if (len > DECODING_CHUNK_SIZE) {
7930 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007931 done = 0;
7932 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007933 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007934#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007935 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007936 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007937 done = 1;
7938 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007939
Victor Stinner76a31a62011-11-04 00:05:13 +01007940 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007941 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007942 errors);
7943 if (ret == -2)
7944 ret = encode_code_page_errors(code_page, &outbytes,
7945 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007946 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007947 if (ret < 0) {
7948 Py_XDECREF(outbytes);
7949 return NULL;
7950 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007951
Victor Stinner7581cef2011-11-03 22:32:33 +01007952 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007953 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007954 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007955
Victor Stinner3a50e702011-10-18 21:21:00 +02007956 return outbytes;
7957}
7958
7959PyObject *
7960PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7961 Py_ssize_t size,
7962 const char *errors)
7963{
Victor Stinner7581cef2011-11-03 22:32:33 +01007964 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007965 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007966 if (unicode == NULL)
7967 return NULL;
7968 res = encode_code_page(CP_ACP, unicode, errors);
7969 Py_DECREF(unicode);
7970 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007971}
7972
7973PyObject *
7974PyUnicode_EncodeCodePage(int code_page,
7975 PyObject *unicode,
7976 const char *errors)
7977{
Victor Stinner7581cef2011-11-03 22:32:33 +01007978 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007979}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007980
Alexander Belopolsky40018472011-02-26 01:02:56 +00007981PyObject *
7982PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007983{
Victor Stinner7581cef2011-11-03 22:32:33 +01007984 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007985}
7986
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007987#undef NEED_RETRY
7988
Steve Dowercc16be82016-09-08 10:35:16 -07007989#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007990
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991/* --- Character Mapping Codec -------------------------------------------- */
7992
Victor Stinnerfb161b12013-04-18 01:44:27 +02007993static int
7994charmap_decode_string(const char *s,
7995 Py_ssize_t size,
7996 PyObject *mapping,
7997 const char *errors,
7998 _PyUnicodeWriter *writer)
7999{
8000 const char *starts = s;
8001 const char *e;
8002 Py_ssize_t startinpos, endinpos;
8003 PyObject *errorHandler = NULL, *exc = NULL;
8004 Py_ssize_t maplen;
8005 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008006 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008007 Py_UCS4 x;
8008 unsigned char ch;
8009
8010 if (PyUnicode_READY(mapping) == -1)
8011 return -1;
8012
8013 maplen = PyUnicode_GET_LENGTH(mapping);
8014 mapdata = PyUnicode_DATA(mapping);
8015 mapkind = PyUnicode_KIND(mapping);
8016
8017 e = s + size;
8018
8019 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8020 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8021 * is disabled in encoding aliases, latin1 is preferred because
8022 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008023 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008024 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8025 Py_UCS4 maxchar = writer->maxchar;
8026
8027 assert (writer->kind == PyUnicode_1BYTE_KIND);
8028 while (s < e) {
8029 ch = *s;
8030 x = mapdata_ucs1[ch];
8031 if (x > maxchar) {
8032 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8033 goto onError;
8034 maxchar = writer->maxchar;
8035 outdata = (Py_UCS1 *)writer->data;
8036 }
8037 outdata[writer->pos] = x;
8038 writer->pos++;
8039 ++s;
8040 }
8041 return 0;
8042 }
8043
8044 while (s < e) {
8045 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8046 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008047 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008048 if (outkind == PyUnicode_1BYTE_KIND) {
8049 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8050 Py_UCS4 maxchar = writer->maxchar;
8051 while (s < e) {
8052 ch = *s;
8053 x = mapdata_ucs2[ch];
8054 if (x > maxchar)
8055 goto Error;
8056 outdata[writer->pos] = x;
8057 writer->pos++;
8058 ++s;
8059 }
8060 break;
8061 }
8062 else if (outkind == PyUnicode_2BYTE_KIND) {
8063 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8064 while (s < e) {
8065 ch = *s;
8066 x = mapdata_ucs2[ch];
8067 if (x == 0xFFFE)
8068 goto Error;
8069 outdata[writer->pos] = x;
8070 writer->pos++;
8071 ++s;
8072 }
8073 break;
8074 }
8075 }
8076 ch = *s;
8077
8078 if (ch < maplen)
8079 x = PyUnicode_READ(mapkind, mapdata, ch);
8080 else
8081 x = 0xfffe; /* invalid value */
8082Error:
8083 if (x == 0xfffe)
8084 {
8085 /* undefined mapping */
8086 startinpos = s-starts;
8087 endinpos = startinpos+1;
8088 if (unicode_decode_call_errorhandler_writer(
8089 errors, &errorHandler,
8090 "charmap", "character maps to <undefined>",
8091 &starts, &e, &startinpos, &endinpos, &exc, &s,
8092 writer)) {
8093 goto onError;
8094 }
8095 continue;
8096 }
8097
8098 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8099 goto onError;
8100 ++s;
8101 }
8102 Py_XDECREF(errorHandler);
8103 Py_XDECREF(exc);
8104 return 0;
8105
8106onError:
8107 Py_XDECREF(errorHandler);
8108 Py_XDECREF(exc);
8109 return -1;
8110}
8111
8112static int
8113charmap_decode_mapping(const char *s,
8114 Py_ssize_t size,
8115 PyObject *mapping,
8116 const char *errors,
8117 _PyUnicodeWriter *writer)
8118{
8119 const char *starts = s;
8120 const char *e;
8121 Py_ssize_t startinpos, endinpos;
8122 PyObject *errorHandler = NULL, *exc = NULL;
8123 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008124 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008125
8126 e = s + size;
8127
8128 while (s < e) {
8129 ch = *s;
8130
8131 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8132 key = PyLong_FromLong((long)ch);
8133 if (key == NULL)
8134 goto onError;
8135
8136 item = PyObject_GetItem(mapping, key);
8137 Py_DECREF(key);
8138 if (item == NULL) {
8139 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8140 /* No mapping found means: mapping is undefined. */
8141 PyErr_Clear();
8142 goto Undefined;
8143 } else
8144 goto onError;
8145 }
8146
8147 /* Apply mapping */
8148 if (item == Py_None)
8149 goto Undefined;
8150 if (PyLong_Check(item)) {
8151 long value = PyLong_AS_LONG(item);
8152 if (value == 0xFFFE)
8153 goto Undefined;
8154 if (value < 0 || value > MAX_UNICODE) {
8155 PyErr_Format(PyExc_TypeError,
8156 "character mapping must be in range(0x%lx)",
8157 (unsigned long)MAX_UNICODE + 1);
8158 goto onError;
8159 }
8160
8161 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8162 goto onError;
8163 }
8164 else if (PyUnicode_Check(item)) {
8165 if (PyUnicode_READY(item) == -1)
8166 goto onError;
8167 if (PyUnicode_GET_LENGTH(item) == 1) {
8168 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8169 if (value == 0xFFFE)
8170 goto Undefined;
8171 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8172 goto onError;
8173 }
8174 else {
8175 writer->overallocate = 1;
8176 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8177 goto onError;
8178 }
8179 }
8180 else {
8181 /* wrong return value */
8182 PyErr_SetString(PyExc_TypeError,
8183 "character mapping must return integer, None or str");
8184 goto onError;
8185 }
8186 Py_CLEAR(item);
8187 ++s;
8188 continue;
8189
8190Undefined:
8191 /* undefined mapping */
8192 Py_CLEAR(item);
8193 startinpos = s-starts;
8194 endinpos = startinpos+1;
8195 if (unicode_decode_call_errorhandler_writer(
8196 errors, &errorHandler,
8197 "charmap", "character maps to <undefined>",
8198 &starts, &e, &startinpos, &endinpos, &exc, &s,
8199 writer)) {
8200 goto onError;
8201 }
8202 }
8203 Py_XDECREF(errorHandler);
8204 Py_XDECREF(exc);
8205 return 0;
8206
8207onError:
8208 Py_XDECREF(item);
8209 Py_XDECREF(errorHandler);
8210 Py_XDECREF(exc);
8211 return -1;
8212}
8213
Alexander Belopolsky40018472011-02-26 01:02:56 +00008214PyObject *
8215PyUnicode_DecodeCharmap(const char *s,
8216 Py_ssize_t size,
8217 PyObject *mapping,
8218 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008220 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008221
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 /* Default to Latin-1 */
8223 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008227 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008228 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008229 writer.min_length = size;
8230 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008232
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008233 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008234 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8235 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008236 }
8237 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008238 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8239 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008241 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008242
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008244 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 return NULL;
8246}
8247
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008248/* Charmap encoding: the lookup table */
8249
Alexander Belopolsky40018472011-02-26 01:02:56 +00008250struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 PyObject_HEAD
8252 unsigned char level1[32];
8253 int count2, count3;
8254 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008255};
8256
8257static PyObject*
8258encoding_map_size(PyObject *obj, PyObject* args)
8259{
8260 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008261 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008263}
8264
8265static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008266 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 PyDoc_STR("Return the size (in bytes) of this object") },
8268 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008269};
8270
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008271static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 "EncodingMap", /*tp_name*/
8274 sizeof(struct encoding_map), /*tp_basicsize*/
8275 0, /*tp_itemsize*/
8276 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008277 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008278 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 0, /*tp_getattr*/
8280 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008281 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 0, /*tp_repr*/
8283 0, /*tp_as_number*/
8284 0, /*tp_as_sequence*/
8285 0, /*tp_as_mapping*/
8286 0, /*tp_hash*/
8287 0, /*tp_call*/
8288 0, /*tp_str*/
8289 0, /*tp_getattro*/
8290 0, /*tp_setattro*/
8291 0, /*tp_as_buffer*/
8292 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8293 0, /*tp_doc*/
8294 0, /*tp_traverse*/
8295 0, /*tp_clear*/
8296 0, /*tp_richcompare*/
8297 0, /*tp_weaklistoffset*/
8298 0, /*tp_iter*/
8299 0, /*tp_iternext*/
8300 encoding_map_methods, /*tp_methods*/
8301 0, /*tp_members*/
8302 0, /*tp_getset*/
8303 0, /*tp_base*/
8304 0, /*tp_dict*/
8305 0, /*tp_descr_get*/
8306 0, /*tp_descr_set*/
8307 0, /*tp_dictoffset*/
8308 0, /*tp_init*/
8309 0, /*tp_alloc*/
8310 0, /*tp_new*/
8311 0, /*tp_free*/
8312 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008313};
8314
8315PyObject*
8316PyUnicode_BuildEncodingMap(PyObject* string)
8317{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318 PyObject *result;
8319 struct encoding_map *mresult;
8320 int i;
8321 int need_dict = 0;
8322 unsigned char level1[32];
8323 unsigned char level2[512];
8324 unsigned char *mlevel1, *mlevel2, *mlevel3;
8325 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008326 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008327 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008328 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008331 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 PyErr_BadArgument();
8333 return NULL;
8334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 kind = PyUnicode_KIND(string);
8336 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008337 length = PyUnicode_GET_LENGTH(string);
8338 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339 memset(level1, 0xFF, sizeof level1);
8340 memset(level2, 0xFF, sizeof level2);
8341
8342 /* If there isn't a one-to-one mapping of NULL to \0,
8343 or if there are non-BMP characters, we need to use
8344 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008345 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008346 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008347 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 ch = PyUnicode_READ(kind, data, i);
8350 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008351 need_dict = 1;
8352 break;
8353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008355 /* unmapped character */
8356 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 l1 = ch >> 11;
8358 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008359 if (level1[l1] == 0xFF)
8360 level1[l1] = count2++;
8361 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008362 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 }
8364
8365 if (count2 >= 0xFF || count3 >= 0xFF)
8366 need_dict = 1;
8367
8368 if (need_dict) {
8369 PyObject *result = PyDict_New();
8370 PyObject *key, *value;
8371 if (!result)
8372 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008373 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008375 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 if (!key || !value)
8377 goto failed1;
8378 if (PyDict_SetItem(result, key, value) == -1)
8379 goto failed1;
8380 Py_DECREF(key);
8381 Py_DECREF(value);
8382 }
8383 return result;
8384 failed1:
8385 Py_XDECREF(key);
8386 Py_XDECREF(value);
8387 Py_DECREF(result);
8388 return NULL;
8389 }
8390
8391 /* Create a three-level trie */
8392 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8393 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008394 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008395 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008396 }
8397
8398 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008399 mresult = (struct encoding_map*)result;
8400 mresult->count2 = count2;
8401 mresult->count3 = count3;
8402 mlevel1 = mresult->level1;
8403 mlevel2 = mresult->level23;
8404 mlevel3 = mresult->level23 + 16*count2;
8405 memcpy(mlevel1, level1, 32);
8406 memset(mlevel2, 0xFF, 16*count2);
8407 memset(mlevel3, 0, 128*count3);
8408 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008409 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008411 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8412 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008413 /* unmapped character */
8414 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008415 o1 = ch>>11;
8416 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008417 i2 = 16*mlevel1[o1] + o2;
8418 if (mlevel2[i2] == 0xFF)
8419 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008420 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008421 i3 = 128*mlevel2[i2] + o3;
8422 mlevel3[i3] = i;
8423 }
8424 return result;
8425}
8426
8427static int
Victor Stinner22168992011-11-20 17:09:18 +01008428encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008429{
8430 struct encoding_map *map = (struct encoding_map*)mapping;
8431 int l1 = c>>11;
8432 int l2 = (c>>7) & 0xF;
8433 int l3 = c & 0x7F;
8434 int i;
8435
Victor Stinner22168992011-11-20 17:09:18 +01008436 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008438 if (c == 0)
8439 return 0;
8440 /* level 1*/
8441 i = map->level1[l1];
8442 if (i == 0xFF) {
8443 return -1;
8444 }
8445 /* level 2*/
8446 i = map->level23[16*i+l2];
8447 if (i == 0xFF) {
8448 return -1;
8449 }
8450 /* level 3 */
8451 i = map->level23[16*map->count2 + 128*i + l3];
8452 if (i == 0) {
8453 return -1;
8454 }
8455 return i;
8456}
8457
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458/* Lookup the character ch in the mapping. If the character
8459 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008460 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008461static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008462charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463{
Christian Heimes217cfd12007-12-02 14:31:20 +00008464 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465 PyObject *x;
8466
8467 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469 x = PyObject_GetItem(mapping, w);
8470 Py_DECREF(w);
8471 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8473 /* No mapping found means: mapping is undefined. */
8474 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008475 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 } else
8477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008479 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008481 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 long value = PyLong_AS_LONG(x);
8483 if (value < 0 || value > 255) {
8484 PyErr_SetString(PyExc_TypeError,
8485 "character mapping must be in range(256)");
8486 Py_DECREF(x);
8487 return NULL;
8488 }
8489 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008491 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 /* wrong return value */
8495 PyErr_Format(PyExc_TypeError,
8496 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008497 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 Py_DECREF(x);
8499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 }
8501}
8502
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008503static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008504charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008505{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8507 /* exponentially overallocate to minimize reallocations */
8508 if (requiredsize < 2*outsize)
8509 requiredsize = 2*outsize;
8510 if (_PyBytes_Resize(outobj, requiredsize))
8511 return -1;
8512 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008513}
8514
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008518/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008519 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 space is available. Return a new reference to the object that
8521 was put in the output buffer, or Py_None, if the mapping was undefined
8522 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008523 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008524static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008525charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008526 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008528 PyObject *rep;
8529 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008530 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531
Andy Lesterdffe4c02020-03-04 07:15:20 -06008532 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008533 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008535 if (res == -1)
8536 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 if (outsize<requiredsize)
8538 if (charmapencode_resize(outobj, outpos, requiredsize))
8539 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008540 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 outstart[(*outpos)++] = (char)res;
8542 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008543 }
8544
8545 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008548 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 Py_DECREF(rep);
8550 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008551 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 if (PyLong_Check(rep)) {
8553 Py_ssize_t requiredsize = *outpos+1;
8554 if (outsize<requiredsize)
8555 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8556 Py_DECREF(rep);
8557 return enc_EXCEPTION;
8558 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008559 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008561 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 else {
8563 const char *repchars = PyBytes_AS_STRING(rep);
8564 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8565 Py_ssize_t requiredsize = *outpos+repsize;
8566 if (outsize<requiredsize)
8567 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8568 Py_DECREF(rep);
8569 return enc_EXCEPTION;
8570 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008571 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 memcpy(outstart + *outpos, repchars, repsize);
8573 *outpos += repsize;
8574 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008576 Py_DECREF(rep);
8577 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578}
8579
8580/* handle an error in PyUnicode_EncodeCharmap
8581 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008582static int
8583charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008584 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008586 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008587 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588{
8589 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008590 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008591 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008592 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008593 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008594 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008596 Py_ssize_t collstartpos = *inpos;
8597 Py_ssize_t collendpos = *inpos+1;
8598 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008599 const char *encoding = "charmap";
8600 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008601 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008602 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008603 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008604
Benjamin Petersonbac79492012-01-14 13:34:47 -05008605 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008606 return -1;
8607 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608 /* find all unencodable characters */
8609 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008610 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008611 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008613 val = encoding_map_lookup(ch, mapping);
8614 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 break;
8616 ++collendpos;
8617 continue;
8618 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008619
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008620 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8621 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 if (rep==NULL)
8623 return -1;
8624 else if (rep!=Py_None) {
8625 Py_DECREF(rep);
8626 break;
8627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008628 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630 }
8631 /* cache callback name lookup
8632 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008633 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008634 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008635
8636 switch (*error_handler) {
8637 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008638 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008639 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008640
8641 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008642 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 x = charmapencode_output('?', mapping, res, respos);
8644 if (x==enc_EXCEPTION) {
8645 return -1;
8646 }
8647 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008648 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 return -1;
8650 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008651 }
8652 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008653 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008654 *inpos = collendpos;
8655 break;
Victor Stinner50149202015-09-22 00:26:54 +02008656
8657 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008658 /* generate replacement (temporarily (mis)uses p) */
8659 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 char buffer[2+29+1+1];
8661 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008662 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 for (cp = buffer; *cp; ++cp) {
8664 x = charmapencode_output(*cp, mapping, res, respos);
8665 if (x==enc_EXCEPTION)
8666 return -1;
8667 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008668 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return -1;
8670 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008671 }
8672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008673 *inpos = collendpos;
8674 break;
Victor Stinner50149202015-09-22 00:26:54 +02008675
Benjamin Peterson14339b62009-01-31 16:36:08 +00008676 default:
Victor Stinner50149202015-09-22 00:26:54 +02008677 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008678 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008680 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008682 if (PyBytes_Check(repunicode)) {
8683 /* Directly copy bytes result to output. */
8684 Py_ssize_t outsize = PyBytes_Size(*res);
8685 Py_ssize_t requiredsize;
8686 repsize = PyBytes_Size(repunicode);
8687 requiredsize = *respos + repsize;
8688 if (requiredsize > outsize)
8689 /* Make room for all additional bytes. */
8690 if (charmapencode_resize(res, respos, requiredsize)) {
8691 Py_DECREF(repunicode);
8692 return -1;
8693 }
8694 memcpy(PyBytes_AsString(*res) + *respos,
8695 PyBytes_AsString(repunicode), repsize);
8696 *respos += repsize;
8697 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008698 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008699 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008701 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008702 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008703 Py_DECREF(repunicode);
8704 return -1;
8705 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008706 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008707 data = PyUnicode_DATA(repunicode);
8708 kind = PyUnicode_KIND(repunicode);
8709 for (index = 0; index < repsize; index++) {
8710 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8711 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008713 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 return -1;
8715 }
8716 else if (x==enc_FAILED) {
8717 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008718 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 return -1;
8720 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008721 }
8722 *inpos = newpos;
8723 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 }
8725 return 0;
8726}
8727
Alexander Belopolsky40018472011-02-26 01:02:56 +00008728PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008729_PyUnicode_EncodeCharmap(PyObject *unicode,
8730 PyObject *mapping,
8731 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733 /* output object */
8734 PyObject *res = NULL;
8735 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008736 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008737 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008739 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008740 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008742 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008743 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008744 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745
Benjamin Petersonbac79492012-01-14 13:34:47 -05008746 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008747 return NULL;
8748 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008749 data = PyUnicode_DATA(unicode);
8750 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008751
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 /* Default to Latin-1 */
8753 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008754 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756 /* allocate enough for a simple encoding without
8757 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008758 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759 if (res == NULL)
8760 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008761 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008765 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008767 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 if (x==enc_EXCEPTION) /* error */
8769 goto onError;
8770 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008771 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008773 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 &res, &respos)) {
8775 goto onError;
8776 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008777 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 else
8779 /* done with this character => adjust input position */
8780 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008784 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008785 if (_PyBytes_Resize(&res, respos) < 0)
8786 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008788 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008789 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008790 return res;
8791
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008793 Py_XDECREF(res);
8794 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008795 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 return NULL;
8797}
8798
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008799/* Deprecated */
8800PyObject *
8801PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8802 Py_ssize_t size,
8803 PyObject *mapping,
8804 const char *errors)
8805{
8806 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008807 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008808 if (unicode == NULL)
8809 return NULL;
8810 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8811 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008812 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008813}
8814
Alexander Belopolsky40018472011-02-26 01:02:56 +00008815PyObject *
8816PyUnicode_AsCharmapString(PyObject *unicode,
8817 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818{
8819 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008820 PyErr_BadArgument();
8821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008823 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824}
8825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008826/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008827static void
8828make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008830 Py_ssize_t startpos, Py_ssize_t endpos,
8831 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008833 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 *exceptionObject = _PyUnicodeTranslateError_Create(
8835 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836 }
8837 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008838 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8839 goto onError;
8840 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8841 goto onError;
8842 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8843 goto onError;
8844 return;
8845 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008846 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 }
8848}
8849
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008850/* error handling callback helper:
8851 build arguments, call the callback and check the arguments,
8852 put the result into newpos and return the replacement string, which
8853 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008854static PyObject *
8855unicode_translate_call_errorhandler(const char *errors,
8856 PyObject **errorHandler,
8857 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008859 Py_ssize_t startpos, Py_ssize_t endpos,
8860 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008861{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008862 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008863
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008864 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008865 PyObject *restuple;
8866 PyObject *resunicode;
8867
8868 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008872 }
8873
8874 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008876 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008878
Petr Viktorinffd97532020-02-11 17:46:57 +01008879 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008880 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008882 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008883 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 Py_DECREF(restuple);
8885 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008886 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008887 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 &resunicode, &i_newpos)) {
8889 Py_DECREF(restuple);
8890 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008891 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008892 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008894 else
8895 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008897 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 Py_DECREF(restuple);
8899 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008900 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008901 Py_INCREF(resunicode);
8902 Py_DECREF(restuple);
8903 return resunicode;
8904}
8905
8906/* Lookup the character ch in the mapping and put the result in result,
8907 which must be decrefed by the caller.
8908 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008909static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008911{
Christian Heimes217cfd12007-12-02 14:31:20 +00008912 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008913 PyObject *x;
8914
8915 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008917 x = PyObject_GetItem(mapping, w);
8918 Py_DECREF(w);
8919 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8921 /* No mapping found means: use 1:1 mapping. */
8922 PyErr_Clear();
8923 *result = NULL;
8924 return 0;
8925 } else
8926 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008927 }
8928 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 *result = x;
8930 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008931 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008932 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008934 if (value < 0 || value > MAX_UNICODE) {
8935 PyErr_Format(PyExc_ValueError,
8936 "character mapping must be in range(0x%x)",
8937 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 Py_DECREF(x);
8939 return -1;
8940 }
8941 *result = x;
8942 return 0;
8943 }
8944 else if (PyUnicode_Check(x)) {
8945 *result = x;
8946 return 0;
8947 }
8948 else {
8949 /* wrong return value */
8950 PyErr_SetString(PyExc_TypeError,
8951 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008952 Py_DECREF(x);
8953 return -1;
8954 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008955}
Victor Stinner1194ea02014-04-04 19:37:40 +02008956
8957/* lookup the character, write the result into the writer.
8958 Return 1 if the result was written into the writer, return 0 if the mapping
8959 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008960static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008961charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8962 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008963{
Victor Stinner1194ea02014-04-04 19:37:40 +02008964 PyObject *item;
8965
8966 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008968
8969 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008970 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008971 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008974 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008975 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008976
8977 if (item == Py_None) {
8978 Py_DECREF(item);
8979 return 0;
8980 }
8981
8982 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008983 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8984 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8985 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008986 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8987 Py_DECREF(item);
8988 return -1;
8989 }
8990 Py_DECREF(item);
8991 return 1;
8992 }
8993
8994 if (!PyUnicode_Check(item)) {
8995 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008997 }
8998
8999 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9000 Py_DECREF(item);
9001 return -1;
9002 }
9003
9004 Py_DECREF(item);
9005 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009006}
9007
Victor Stinner89a76ab2014-04-05 11:44:04 +02009008static int
9009unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9010 Py_UCS1 *translate)
9011{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009012 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009013 int ret = 0;
9014
Victor Stinner89a76ab2014-04-05 11:44:04 +02009015 if (charmaptranslate_lookup(ch, mapping, &item)) {
9016 return -1;
9017 }
9018
9019 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009020 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009021 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009022 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009023 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009024 /* not found => default to 1:1 mapping */
9025 translate[ch] = ch;
9026 return 1;
9027 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009028 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009029 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009030 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9031 used it */
9032 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009033 /* invalid character or character outside ASCII:
9034 skip the fast translate */
9035 goto exit;
9036 }
9037 translate[ch] = (Py_UCS1)replace;
9038 }
9039 else if (PyUnicode_Check(item)) {
9040 Py_UCS4 replace;
9041
9042 if (PyUnicode_READY(item) == -1) {
9043 Py_DECREF(item);
9044 return -1;
9045 }
9046 if (PyUnicode_GET_LENGTH(item) != 1)
9047 goto exit;
9048
9049 replace = PyUnicode_READ_CHAR(item, 0);
9050 if (replace > 127)
9051 goto exit;
9052 translate[ch] = (Py_UCS1)replace;
9053 }
9054 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009055 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009056 goto exit;
9057 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009058 ret = 1;
9059
Benjamin Peterson1365de72014-04-07 20:15:41 -04009060 exit:
9061 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009062 return ret;
9063}
9064
9065/* Fast path for ascii => ascii translation. Return 1 if the whole string
9066 was translated into writer, return 0 if the input string was partially
9067 translated into writer, raise an exception and return -1 on error. */
9068static int
9069unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009070 _PyUnicodeWriter *writer, int ignore,
9071 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009072{
Victor Stinner872b2912014-04-05 14:27:07 +02009073 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009074 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009075 const Py_UCS1 *in, *end;
9076 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009077 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009078
Victor Stinner89a76ab2014-04-05 11:44:04 +02009079 len = PyUnicode_GET_LENGTH(input);
9080
Victor Stinner872b2912014-04-05 14:27:07 +02009081 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009082
9083 in = PyUnicode_1BYTE_DATA(input);
9084 end = in + len;
9085
9086 assert(PyUnicode_IS_ASCII(writer->buffer));
9087 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9088 out = PyUnicode_1BYTE_DATA(writer->buffer);
9089
Victor Stinner872b2912014-04-05 14:27:07 +02009090 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009091 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009092 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009093 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009094 int translate = unicode_fast_translate_lookup(mapping, ch,
9095 ascii_table);
9096 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009097 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009098 if (translate == 0)
9099 goto exit;
9100 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009101 }
Victor Stinner872b2912014-04-05 14:27:07 +02009102 if (ch2 == 0xfe) {
9103 if (ignore)
9104 continue;
9105 goto exit;
9106 }
9107 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009108 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009109 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009110 }
Victor Stinner872b2912014-04-05 14:27:07 +02009111 res = 1;
9112
9113exit:
9114 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009115 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009116 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009117}
9118
Victor Stinner3222da22015-10-01 22:07:32 +02009119static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120_PyUnicode_TranslateCharmap(PyObject *input,
9121 PyObject *mapping,
9122 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009125 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 Py_ssize_t size, i;
9127 int kind;
9128 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009129 _PyUnicodeWriter writer;
9130 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009131 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009132 PyObject *errorHandler = NULL;
9133 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009134 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009135 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009136
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 PyErr_BadArgument();
9139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 if (PyUnicode_READY(input) == -1)
9143 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009144 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 kind = PyUnicode_KIND(input);
9146 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009148 if (size == 0)
9149 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009151 /* allocate enough for a simple 1:1 translation without
9152 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009153 _PyUnicodeWriter_Init(&writer);
9154 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156
Victor Stinner872b2912014-04-05 14:27:07 +02009157 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9158
Victor Stinner33798672016-03-01 21:59:58 +01009159 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009160 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009161 if (PyUnicode_IS_ASCII(input)) {
9162 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9163 if (res < 0) {
9164 _PyUnicodeWriter_Dealloc(&writer);
9165 return NULL;
9166 }
9167 if (res == 1)
9168 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009169 }
Victor Stinner33798672016-03-01 21:59:58 +01009170 else {
9171 i = 0;
9172 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009176 int translate;
9177 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9178 Py_ssize_t newpos;
9179 /* startpos for collecting untranslatable chars */
9180 Py_ssize_t collstart;
9181 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009182 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183
Victor Stinner1194ea02014-04-04 19:37:40 +02009184 ch = PyUnicode_READ(kind, data, i);
9185 translate = charmaptranslate_output(ch, mapping, &writer);
9186 if (translate < 0)
9187 goto onError;
9188
9189 if (translate != 0) {
9190 /* it worked => adjust input pointer */
9191 ++i;
9192 continue;
9193 }
9194
9195 /* untranslatable character */
9196 collstart = i;
9197 collend = i+1;
9198
9199 /* find all untranslatable characters */
9200 while (collend < size) {
9201 PyObject *x;
9202 ch = PyUnicode_READ(kind, data, collend);
9203 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009204 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009205 Py_XDECREF(x);
9206 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009208 ++collend;
9209 }
9210
9211 if (ignore) {
9212 i = collend;
9213 }
9214 else {
9215 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9216 reason, input, &exc,
9217 collstart, collend, &newpos);
9218 if (repunicode == NULL)
9219 goto onError;
9220 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009222 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009223 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009224 Py_DECREF(repunicode);
9225 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009226 }
9227 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009228 Py_XDECREF(exc);
9229 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009230 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231
Benjamin Peterson29060642009-01-31 22:14:21 +00009232 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009233 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009234 Py_XDECREF(exc);
9235 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236 return NULL;
9237}
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239/* Deprecated. Use PyUnicode_Translate instead. */
9240PyObject *
9241PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9242 Py_ssize_t size,
9243 PyObject *mapping,
9244 const char *errors)
9245{
Christian Heimes5f520f42012-09-11 14:03:25 +02009246 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009247 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 if (!unicode)
9249 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009250 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9251 Py_DECREF(unicode);
9252 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253}
9254
Alexander Belopolsky40018472011-02-26 01:02:56 +00009255PyObject *
9256PyUnicode_Translate(PyObject *str,
9257 PyObject *mapping,
9258 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009260 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009261 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009262 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263}
Tim Petersced69f82003-09-16 20:30:58 +00009264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265PyObject *
9266_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9267{
9268 if (!PyUnicode_Check(unicode)) {
9269 PyErr_BadInternalCall();
9270 return NULL;
9271 }
9272 if (PyUnicode_READY(unicode) == -1)
9273 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009274 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 /* If the string is already ASCII, just return the same string */
9276 Py_INCREF(unicode);
9277 return unicode;
9278 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009279
9280 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9281 PyObject *result = PyUnicode_New(len, 127);
9282 if (result == NULL) {
9283 return NULL;
9284 }
9285
9286 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9287 int kind = PyUnicode_KIND(unicode);
9288 const void *data = PyUnicode_DATA(unicode);
9289 Py_ssize_t i;
9290 for (i = 0; i < len; ++i) {
9291 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9292 if (ch < 127) {
9293 out[i] = ch;
9294 }
9295 else if (Py_UNICODE_ISSPACE(ch)) {
9296 out[i] = ' ';
9297 }
9298 else {
9299 int decimal = Py_UNICODE_TODECIMAL(ch);
9300 if (decimal < 0) {
9301 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009302 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009303 _PyUnicode_LENGTH(result) = i + 1;
9304 break;
9305 }
9306 out[i] = '0' + decimal;
9307 }
9308 }
9309
INADA Naoki16dfca42018-07-14 12:06:43 +09009310 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009311 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312}
9313
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009314PyObject *
9315PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9316 Py_ssize_t length)
9317{
Victor Stinnerf0124502011-11-21 23:12:56 +01009318 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009319 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009320 Py_UCS4 maxchar;
9321 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009322 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009323
Victor Stinner99d7ad02012-02-22 13:37:39 +01009324 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009325 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009326 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009327 if (ch > 127) {
9328 int decimal = Py_UNICODE_TODECIMAL(ch);
9329 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009330 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009331 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009332 }
9333 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009334
9335 /* Copy to a new string */
9336 decimal = PyUnicode_New(length, maxchar);
9337 if (decimal == NULL)
9338 return decimal;
9339 kind = PyUnicode_KIND(decimal);
9340 data = PyUnicode_DATA(decimal);
9341 /* Iterate over code points */
9342 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009343 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009344 if (ch > 127) {
9345 int decimal = Py_UNICODE_TODECIMAL(ch);
9346 if (decimal >= 0)
9347 ch = '0' + decimal;
9348 }
9349 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009351 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009352}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009353/* --- Decimal Encoder ---------------------------------------------------- */
9354
Alexander Belopolsky40018472011-02-26 01:02:56 +00009355int
9356PyUnicode_EncodeDecimal(Py_UNICODE *s,
9357 Py_ssize_t length,
9358 char *output,
9359 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009360{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009361 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009362 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009363 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009364 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009365
9366 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 PyErr_BadArgument();
9368 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009369 }
9370
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009371 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009372 if (unicode == NULL)
9373 return -1;
9374
Victor Stinner42bf7752011-11-21 22:52:58 +01009375 kind = PyUnicode_KIND(unicode);
9376 data = PyUnicode_DATA(unicode);
9377
Victor Stinnerb84d7232011-11-22 01:50:07 +01009378 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009379 PyObject *exc;
9380 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009382 Py_ssize_t startpos;
9383
9384 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009385
Benjamin Peterson29060642009-01-31 22:14:21 +00009386 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009387 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009388 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009390 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 decimal = Py_UNICODE_TODECIMAL(ch);
9392 if (decimal >= 0) {
9393 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009394 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009395 continue;
9396 }
9397 if (0 < ch && ch < 256) {
9398 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009399 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009400 continue;
9401 }
Victor Stinner6345be92011-11-25 20:09:01 +01009402
Victor Stinner42bf7752011-11-21 22:52:58 +01009403 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009404 exc = NULL;
9405 raise_encode_exception(&exc, "decimal", unicode,
9406 startpos, startpos+1,
9407 "invalid decimal Unicode string");
9408 Py_XDECREF(exc);
9409 Py_DECREF(unicode);
9410 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009411 }
9412 /* 0-terminate the output string */
9413 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009414 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009415 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009416}
9417
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418/* --- Helpers ------------------------------------------------------------ */
9419
/* helper macro to fixup start/end slice values

   Clamps `end` to at most `len`; negative `start`/`end` values are taken
   relative to `len` and then clamped to 0.  `start` and `end` must be
   modifiable lvalues: they are updated in place.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and is safe under an unbraced if/else.  Invoke it
   with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009436any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009438 Py_ssize_t end,
9439 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009441 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009442 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 Py_ssize_t len1, len2, result;
9444
9445 kind1 = PyUnicode_KIND(s1);
9446 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009447 if (kind1 < kind2)
9448 return -1;
9449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 len1 = PyUnicode_GET_LENGTH(s1);
9451 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009452 ADJUST_INDICES(start, end, len1);
9453 if (end - start < len2)
9454 return -1;
9455
9456 buf1 = PyUnicode_DATA(s1);
9457 buf2 = PyUnicode_DATA(s2);
9458 if (len2 == 1) {
9459 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9460 result = findchar((const char *)buf1 + kind1*start,
9461 kind1, end - start, ch, direction);
9462 if (result == -1)
9463 return -1;
9464 else
9465 return start + result;
9466 }
9467
9468 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009469 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009470 if (!buf2)
9471 return -2;
9472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473
Victor Stinner794d5672011-10-10 03:21:36 +02009474 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009475 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009476 case PyUnicode_1BYTE_KIND:
9477 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9478 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9479 else
9480 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9481 break;
9482 case PyUnicode_2BYTE_KIND:
9483 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9484 break;
9485 case PyUnicode_4BYTE_KIND:
9486 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9487 break;
9488 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009489 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009490 }
9491 }
9492 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009493 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009494 case PyUnicode_1BYTE_KIND:
9495 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9496 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9497 else
9498 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9499 break;
9500 case PyUnicode_2BYTE_KIND:
9501 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9502 break;
9503 case PyUnicode_4BYTE_KIND:
9504 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9505 break;
9506 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009507 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 }
9510
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009511 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009512 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009513 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514
9515 return result;
9516}
9517
Victor Stinner59423e32018-11-26 13:40:01 +01009518/* _PyUnicode_InsertThousandsGrouping() helper functions */
9519#include "stringlib/localeutil.h"
9520
9521/**
9522 * InsertThousandsGrouping:
9523 * @writer: Unicode writer.
9524 * @n_buffer: Number of characters in @buffer.
9525 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9526 * @d_pos: Start of digits string.
9527 * @n_digits: The number of digits in the string, in which we want
9528 * to put the grouping chars.
9529 * @min_width: The minimum width of the digits in the output string.
9530 * Output will be zero-padded on the left to fill.
9531 * @grouping: see definition in localeconv().
9532 * @thousands_sep: see definition in localeconv().
9533 *
9534 * There are 2 modes: counting and filling. If @writer is NULL,
9535 * we are in counting mode, else filling mode.
9536 * If counting, the required buffer size is returned.
9537 * If filling, we know the buffer will be large enough, so we don't
9538 * need to pass in the buffer size.
9539 * Inserts thousand grouping characters (as defined by grouping and
9540 * thousands_sep) into @writer.
9541 *
9542 * Return value: -1 on error, number of characters otherwise.
9543 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009545_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009546 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009547 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009548 PyObject *digits,
9549 Py_ssize_t d_pos,
9550 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009551 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009552 const char *grouping,
9553 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009554 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555{
Xtreak3f7983a2019-01-07 20:39:14 +05309556 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009557 if (writer) {
9558 assert(digits != NULL);
9559 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009560 }
9561 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009562 assert(digits == NULL);
9563 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009564 }
Victor Stinner59423e32018-11-26 13:40:01 +01009565 assert(0 <= d_pos);
9566 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009567 assert(grouping != NULL);
9568
9569 if (digits != NULL) {
9570 if (PyUnicode_READY(digits) == -1) {
9571 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009572 }
Victor Stinner59423e32018-11-26 13:40:01 +01009573 }
9574 if (PyUnicode_READY(thousands_sep) == -1) {
9575 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009576 }
9577
Victor Stinner59423e32018-11-26 13:40:01 +01009578 Py_ssize_t count = 0;
9579 Py_ssize_t n_zeros;
9580 int loop_broken = 0;
9581 int use_separator = 0; /* First time through, don't append the
9582 separator. They only go between
9583 groups. */
9584 Py_ssize_t buffer_pos;
9585 Py_ssize_t digits_pos;
9586 Py_ssize_t len;
9587 Py_ssize_t n_chars;
9588 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9589 be looked at */
9590 /* A generator that returns all of the grouping widths, until it
9591 returns 0. */
9592 GroupGenerator groupgen;
9593 GroupGenerator_init(&groupgen, grouping);
9594 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9595
9596 /* if digits are not grouped, thousands separator
9597 should be an empty string */
9598 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9599
9600 digits_pos = d_pos + n_digits;
9601 if (writer) {
9602 buffer_pos = writer->pos + n_buffer;
9603 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9604 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 }
Victor Stinner59423e32018-11-26 13:40:01 +01009606 else {
9607 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009608 }
Victor Stinner59423e32018-11-26 13:40:01 +01009609
9610 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009611 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009612 }
Victor Stinner59423e32018-11-26 13:40:01 +01009613
9614 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9615 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9616 n_zeros = Py_MAX(0, len - remaining);
9617 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9618
9619 /* Use n_zero zero's and n_chars chars */
9620
9621 /* Count only, don't do anything. */
9622 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9623
9624 /* Copy into the writer. */
9625 InsertThousandsGrouping_fill(writer, &buffer_pos,
9626 digits, &digits_pos,
9627 n_chars, n_zeros,
9628 use_separator ? thousands_sep : NULL,
9629 thousands_sep_len, maxchar);
9630
9631 /* Use a separator next time. */
9632 use_separator = 1;
9633
9634 remaining -= n_chars;
9635 min_width -= len;
9636
9637 if (remaining <= 0 && min_width <= 0) {
9638 loop_broken = 1;
9639 break;
9640 }
9641 min_width -= thousands_sep_len;
9642 }
9643 if (!loop_broken) {
9644 /* We left the loop without using a break statement. */
9645
9646 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9647 n_zeros = Py_MAX(0, len - remaining);
9648 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9649
9650 /* Use n_zero zero's and n_chars chars */
9651 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9652
9653 /* Copy into the writer. */
9654 InsertThousandsGrouping_fill(writer, &buffer_pos,
9655 digits, &digits_pos,
9656 n_chars, n_zeros,
9657 use_separator ? thousands_sep : NULL,
9658 thousands_sep_len, maxchar);
9659 }
9660 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661}
9662
9663
Alexander Belopolsky40018472011-02-26 01:02:56 +00009664Py_ssize_t
9665PyUnicode_Count(PyObject *str,
9666 PyObject *substr,
9667 Py_ssize_t start,
9668 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009670 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009671 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009672 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009674
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009675 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009676 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009677
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009678 kind1 = PyUnicode_KIND(str);
9679 kind2 = PyUnicode_KIND(substr);
9680 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009681 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009682
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009683 len1 = PyUnicode_GET_LENGTH(str);
9684 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009686 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009687 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009688
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009689 buf1 = PyUnicode_DATA(str);
9690 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009691 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009692 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009693 if (!buf2)
9694 goto onError;
9695 }
9696
9697 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009699 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009700 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009701 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009702 buf2, len2, PY_SSIZE_T_MAX
9703 );
9704 else
9705 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009706 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009707 buf2, len2, PY_SSIZE_T_MAX
9708 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 break;
9710 case PyUnicode_2BYTE_KIND:
9711 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009712 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 buf2, len2, PY_SSIZE_T_MAX
9714 );
9715 break;
9716 case PyUnicode_4BYTE_KIND:
9717 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009718 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719 buf2, len2, PY_SSIZE_T_MAX
9720 );
9721 break;
9722 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009723 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009725
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009726 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009727 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009728 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009732 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9733 if (kind2 != kind1)
9734 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736}
9737
Alexander Belopolsky40018472011-02-26 01:02:56 +00009738Py_ssize_t
9739PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009740 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009741 Py_ssize_t start,
9742 Py_ssize_t end,
9743 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009745 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009746 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009747
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009748 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749}
9750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751Py_ssize_t
9752PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9753 Py_ssize_t start, Py_ssize_t end,
9754 int direction)
9755{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009757 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 if (PyUnicode_READY(str) == -1)
9759 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009760 len = PyUnicode_GET_LENGTH(str);
9761 ADJUST_INDICES(start, end, len);
9762 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009763 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009765 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9766 kind, end-start, ch, direction);
9767 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009769 else
9770 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771}
9772
Alexander Belopolsky40018472011-02-26 01:02:56 +00009773static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009774tailmatch(PyObject *self,
9775 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009776 Py_ssize_t start,
9777 Py_ssize_t end,
9778 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 int kind_self;
9781 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009782 const void *data_self;
9783 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 Py_ssize_t offset;
9785 Py_ssize_t i;
9786 Py_ssize_t end_sub;
9787
9788 if (PyUnicode_READY(self) == -1 ||
9789 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009790 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9793 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009795 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009797 if (PyUnicode_GET_LENGTH(substring) == 0)
9798 return 1;
9799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 kind_self = PyUnicode_KIND(self);
9801 data_self = PyUnicode_DATA(self);
9802 kind_sub = PyUnicode_KIND(substring);
9803 data_sub = PyUnicode_DATA(substring);
9804 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9805
9806 if (direction > 0)
9807 offset = end;
9808 else
9809 offset = start;
9810
9811 if (PyUnicode_READ(kind_self, data_self, offset) ==
9812 PyUnicode_READ(kind_sub, data_sub, 0) &&
9813 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9814 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9815 /* If both are of the same kind, memcmp is sufficient */
9816 if (kind_self == kind_sub) {
9817 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009818 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 data_sub,
9820 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009821 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009823 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 else {
9825 /* We do not need to compare 0 and len(substring)-1 because
9826 the if statement above ensured already that they are equal
9827 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 for (i = 1; i < end_sub; ++i) {
9829 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9830 PyUnicode_READ(kind_sub, data_sub, i))
9831 return 0;
9832 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835 }
9836
9837 return 0;
9838}
9839
Alexander Belopolsky40018472011-02-26 01:02:56 +00009840Py_ssize_t
9841PyUnicode_Tailmatch(PyObject *str,
9842 PyObject *substr,
9843 Py_ssize_t start,
9844 Py_ssize_t end,
9845 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009847 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009848 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009849
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009850 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851}
9852
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009853static PyObject *
9854ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009856 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009857 const char *data = PyUnicode_DATA(self);
9858 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009859 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009860
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009861 res = PyUnicode_New(len, 127);
9862 if (res == NULL)
9863 return NULL;
9864 resdata = PyUnicode_DATA(res);
9865 if (lower)
9866 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009868 _Py_bytes_upper(resdata, data, len);
9869 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870}
9871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009873handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009875 Py_ssize_t j;
9876 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009877 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009878 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009879
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009880 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9881
9882 where ! is a negation and \p{xxx} is a character with property xxx.
9883 */
9884 for (j = i - 1; j >= 0; j--) {
9885 c = PyUnicode_READ(kind, data, j);
9886 if (!_PyUnicode_IsCaseIgnorable(c))
9887 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009889 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9890 if (final_sigma) {
9891 for (j = i + 1; j < length; j++) {
9892 c = PyUnicode_READ(kind, data, j);
9893 if (!_PyUnicode_IsCaseIgnorable(c))
9894 break;
9895 }
9896 final_sigma = j == length || !_PyUnicode_IsCased(c);
9897 }
9898 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899}
9900
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009901static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009902lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009903 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009905 /* Obscure special case. */
9906 if (c == 0x3A3) {
9907 mapped[0] = handle_capital_sigma(kind, data, length, i);
9908 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009910 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911}
9912
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009913static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009914do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009915{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009916 Py_ssize_t i, k = 0;
9917 int n_res, j;
9918 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009919
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009920 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009921 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009922 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009923 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009924 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009926 for (i = 1; i < length; i++) {
9927 c = PyUnicode_READ(kind, data, i);
9928 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9929 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009930 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009931 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009932 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009933 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009934 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935}
9936
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009937static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009938do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009939 Py_ssize_t i, k = 0;
9940
9941 for (i = 0; i < length; i++) {
9942 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9943 int n_res, j;
9944 if (Py_UNICODE_ISUPPER(c)) {
9945 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9946 }
9947 else if (Py_UNICODE_ISLOWER(c)) {
9948 n_res = _PyUnicode_ToUpperFull(c, mapped);
9949 }
9950 else {
9951 n_res = 1;
9952 mapped[0] = c;
9953 }
9954 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009955 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009956 res[k++] = mapped[j];
9957 }
9958 }
9959 return k;
9960}
9961
9962static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009963do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009964 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009966 Py_ssize_t i, k = 0;
9967
9968 for (i = 0; i < length; i++) {
9969 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9970 int n_res, j;
9971 if (lower)
9972 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9973 else
9974 n_res = _PyUnicode_ToUpperFull(c, mapped);
9975 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009976 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009977 res[k++] = mapped[j];
9978 }
9979 }
9980 return k;
9981}
9982
9983static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009984do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009985{
9986 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9987}
9988
9989static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009990do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009991{
9992 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9993}
9994
Benjamin Petersone51757f2012-01-12 21:10:29 -05009995static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009996do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -05009997{
9998 Py_ssize_t i, k = 0;
9999
10000 for (i = 0; i < length; i++) {
10001 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10002 Py_UCS4 mapped[3];
10003 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10004 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010005 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010006 res[k++] = mapped[j];
10007 }
10008 }
10009 return k;
10010}
10011
10012static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010013do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010014{
10015 Py_ssize_t i, k = 0;
10016 int previous_is_cased;
10017
10018 previous_is_cased = 0;
10019 for (i = 0; i < length; i++) {
10020 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10021 Py_UCS4 mapped[3];
10022 int n_res, j;
10023
10024 if (previous_is_cased)
10025 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10026 else
10027 n_res = _PyUnicode_ToTitleFull(c, mapped);
10028
10029 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010030 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010031 res[k++] = mapped[j];
10032 }
10033
10034 previous_is_cased = _PyUnicode_IsCased(c);
10035 }
10036 return k;
10037}
10038
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010039static PyObject *
10040case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010041 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010042{
10043 PyObject *res = NULL;
10044 Py_ssize_t length, newlength = 0;
10045 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010046 const void *data;
10047 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010048 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10049
Benjamin Petersoneea48462012-01-16 14:28:50 -050010050 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010051
10052 kind = PyUnicode_KIND(self);
10053 data = PyUnicode_DATA(self);
10054 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010055 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010056 PyErr_SetString(PyExc_OverflowError, "string is too long");
10057 return NULL;
10058 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010059 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010060 if (tmp == NULL)
10061 return PyErr_NoMemory();
10062 newlength = perform(kind, data, length, tmp, &maxchar);
10063 res = PyUnicode_New(newlength, maxchar);
10064 if (res == NULL)
10065 goto leave;
10066 tmpend = tmp + newlength;
10067 outdata = PyUnicode_DATA(res);
10068 outkind = PyUnicode_KIND(res);
10069 switch (outkind) {
10070 case PyUnicode_1BYTE_KIND:
10071 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10072 break;
10073 case PyUnicode_2BYTE_KIND:
10074 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10075 break;
10076 case PyUnicode_4BYTE_KIND:
10077 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10078 break;
10079 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010080 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010081 }
10082 leave:
10083 PyMem_FREE(tmp);
10084 return res;
10085}
10086
Tim Peters8ce9f162004-08-27 01:49:32 +000010087PyObject *
10088PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010089{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010090 PyObject *res;
10091 PyObject *fseq;
10092 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010093 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010094
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010095 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010096 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010097 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010098 }
10099
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010100 /* NOTE: the following code can't call back into Python code,
10101 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010102 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010103
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010104 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010105 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010106 res = _PyUnicode_JoinArray(separator, items, seqlen);
10107 Py_DECREF(fseq);
10108 return res;
10109}
10110
10111PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010112_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010113{
10114 PyObject *res = NULL; /* the result */
10115 PyObject *sep = NULL;
10116 Py_ssize_t seplen;
10117 PyObject *item;
10118 Py_ssize_t sz, i, res_offset;
10119 Py_UCS4 maxchar;
10120 Py_UCS4 item_maxchar;
10121 int use_memcpy;
10122 unsigned char *res_data = NULL, *sep_data = NULL;
10123 PyObject *last_obj;
10124 unsigned int kind = 0;
10125
Tim Peters05eba1f2004-08-27 21:32:02 +000010126 /* If empty sequence, return u"". */
10127 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010128 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010129 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010130
Tim Peters05eba1f2004-08-27 21:32:02 +000010131 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010132 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010133 if (seqlen == 1) {
10134 if (PyUnicode_CheckExact(items[0])) {
10135 res = items[0];
10136 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010137 return res;
10138 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010139 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010140 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010141 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010142 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010143 /* Set up sep and seplen */
10144 if (separator == NULL) {
10145 /* fall back to a blank space separator */
10146 sep = PyUnicode_FromOrdinal(' ');
10147 if (!sep)
10148 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010149 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010150 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010151 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010152 else {
10153 if (!PyUnicode_Check(separator)) {
10154 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010155 "separator: expected str instance,"
10156 " %.80s found",
10157 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010158 goto onError;
10159 }
10160 if (PyUnicode_READY(separator))
10161 goto onError;
10162 sep = separator;
10163 seplen = PyUnicode_GET_LENGTH(separator);
10164 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10165 /* inc refcount to keep this code path symmetric with the
10166 above case of a blank separator */
10167 Py_INCREF(sep);
10168 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010169 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010170 }
10171
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010172 /* There are at least two things to join, or else we have a subclass
10173 * of str in the sequence.
10174 * Do a pre-pass to figure out the total amount of space we'll
10175 * need (sz), and see whether all argument are strings.
10176 */
10177 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010178#ifdef Py_DEBUG
10179 use_memcpy = 0;
10180#else
10181 use_memcpy = 1;
10182#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010183 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010184 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010185 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010186 if (!PyUnicode_Check(item)) {
10187 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010188 "sequence item %zd: expected str instance,"
10189 " %.80s found",
10190 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010191 goto onError;
10192 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 if (PyUnicode_READY(item) == -1)
10194 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010195 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010197 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010198 if (i != 0) {
10199 add_sz += seplen;
10200 }
10201 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010202 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010204 goto onError;
10205 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010206 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010207 if (use_memcpy && last_obj != NULL) {
10208 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10209 use_memcpy = 0;
10210 }
10211 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010212 }
Tim Petersced69f82003-09-16 20:30:58 +000010213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010215 if (res == NULL)
10216 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010217
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010218 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010219#ifdef Py_DEBUG
10220 use_memcpy = 0;
10221#else
10222 if (use_memcpy) {
10223 res_data = PyUnicode_1BYTE_DATA(res);
10224 kind = PyUnicode_KIND(res);
10225 if (seplen != 0)
10226 sep_data = PyUnicode_1BYTE_DATA(sep);
10227 }
10228#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010229 if (use_memcpy) {
10230 for (i = 0; i < seqlen; ++i) {
10231 Py_ssize_t itemlen;
10232 item = items[i];
10233
10234 /* Copy item, and maybe the separator. */
10235 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010236 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010237 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010238 kind * seplen);
10239 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010240 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010241
10242 itemlen = PyUnicode_GET_LENGTH(item);
10243 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010244 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010245 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 kind * itemlen);
10247 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010248 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010249 }
10250 assert(res_data == PyUnicode_1BYTE_DATA(res)
10251 + kind * PyUnicode_GET_LENGTH(res));
10252 }
10253 else {
10254 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10255 Py_ssize_t itemlen;
10256 item = items[i];
10257
10258 /* Copy item, and maybe the separator. */
10259 if (i && seplen != 0) {
10260 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10261 res_offset += seplen;
10262 }
10263
10264 itemlen = PyUnicode_GET_LENGTH(item);
10265 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010266 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010267 res_offset += itemlen;
10268 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010269 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010270 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010271 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010274 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276
Benjamin Peterson29060642009-01-31 22:14:21 +000010277 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010279 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280 return NULL;
10281}
10282
Victor Stinnerd3f08822012-05-29 12:57:52 +020010283void
10284_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10285 Py_UCS4 fill_char)
10286{
10287 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010288 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010289 assert(PyUnicode_IS_READY(unicode));
10290 assert(unicode_modifiable(unicode));
10291 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10292 assert(start >= 0);
10293 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010294 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010295}
10296
Victor Stinner3fe55312012-01-04 00:33:50 +010010297Py_ssize_t
10298PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10299 Py_UCS4 fill_char)
10300{
10301 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010302
10303 if (!PyUnicode_Check(unicode)) {
10304 PyErr_BadInternalCall();
10305 return -1;
10306 }
10307 if (PyUnicode_READY(unicode) == -1)
10308 return -1;
10309 if (unicode_check_modifiable(unicode))
10310 return -1;
10311
Victor Stinnerd3f08822012-05-29 12:57:52 +020010312 if (start < 0) {
10313 PyErr_SetString(PyExc_IndexError, "string index out of range");
10314 return -1;
10315 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010316 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10317 PyErr_SetString(PyExc_ValueError,
10318 "fill character is bigger than "
10319 "the string maximum character");
10320 return -1;
10321 }
10322
10323 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10324 length = Py_MIN(maxlen, length);
10325 if (length <= 0)
10326 return 0;
10327
Victor Stinnerd3f08822012-05-29 12:57:52 +020010328 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010329 return length;
10330}
10331
Victor Stinner9310abb2011-10-05 00:59:23 +020010332static PyObject *
10333pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010334 Py_ssize_t left,
10335 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 PyObject *u;
10339 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010340 int kind;
10341 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010342
10343 if (left < 0)
10344 left = 0;
10345 if (right < 0)
10346 right = 0;
10347
Victor Stinnerc4b49542011-12-11 22:44:26 +010010348 if (left == 0 && right == 0)
10349 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10352 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010353 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10354 return NULL;
10355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010357 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010359 if (!u)
10360 return NULL;
10361
10362 kind = PyUnicode_KIND(u);
10363 data = PyUnicode_DATA(u);
10364 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010365 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010366 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010367 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010368 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010369 assert(_PyUnicode_CheckConsistency(u, 1));
10370 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371}
10372
Alexander Belopolsky40018472011-02-26 01:02:56 +000010373PyObject *
10374PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010378 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380
Benjamin Petersonead6b532011-12-20 17:23:42 -060010381 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010383 if (PyUnicode_IS_ASCII(string))
10384 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010385 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010386 PyUnicode_GET_LENGTH(string), keepends);
10387 else
10388 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010389 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010390 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 break;
10392 case PyUnicode_2BYTE_KIND:
10393 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010394 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 PyUnicode_GET_LENGTH(string), keepends);
10396 break;
10397 case PyUnicode_4BYTE_KIND:
10398 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 PyUnicode_GET_LENGTH(string), keepends);
10401 break;
10402 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010403 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406}
10407
Alexander Belopolsky40018472011-02-26 01:02:56 +000010408static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010409split(PyObject *self,
10410 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010411 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010413 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010414 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 Py_ssize_t len1, len2;
10416 PyObject* out;
10417
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010419 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 if (PyUnicode_READY(self) == -1)
10422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010425 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010427 if (PyUnicode_IS_ASCII(self))
10428 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010429 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010430 PyUnicode_GET_LENGTH(self), maxcount
10431 );
10432 else
10433 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010434 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010435 PyUnicode_GET_LENGTH(self), maxcount
10436 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 case PyUnicode_2BYTE_KIND:
10438 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010439 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 PyUnicode_GET_LENGTH(self), maxcount
10441 );
10442 case PyUnicode_4BYTE_KIND:
10443 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010444 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 PyUnicode_GET_LENGTH(self), maxcount
10446 );
10447 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010448 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 }
10450
10451 if (PyUnicode_READY(substring) == -1)
10452 return NULL;
10453
10454 kind1 = PyUnicode_KIND(self);
10455 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 len1 = PyUnicode_GET_LENGTH(self);
10457 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010458 if (kind1 < kind2 || len1 < len2) {
10459 out = PyList_New(1);
10460 if (out == NULL)
10461 return NULL;
10462 Py_INCREF(self);
10463 PyList_SET_ITEM(out, 0, self);
10464 return out;
10465 }
10466 buf1 = PyUnicode_DATA(self);
10467 buf2 = PyUnicode_DATA(substring);
10468 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010469 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010470 if (!buf2)
10471 return NULL;
10472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010474 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010476 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10477 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010478 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010479 else
10480 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010481 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 break;
10483 case PyUnicode_2BYTE_KIND:
10484 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010485 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 break;
10487 case PyUnicode_4BYTE_KIND:
10488 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010489 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 break;
10491 default:
10492 out = NULL;
10493 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010494 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010495 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010496 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498}
10499
Alexander Belopolsky40018472011-02-26 01:02:56 +000010500static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010501rsplit(PyObject *self,
10502 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010503 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010504{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010505 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010506 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 Py_ssize_t len1, len2;
10508 PyObject* out;
10509
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010510 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010511 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (PyUnicode_READY(self) == -1)
10514 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010517 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010519 if (PyUnicode_IS_ASCII(self))
10520 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010521 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010522 PyUnicode_GET_LENGTH(self), maxcount
10523 );
10524 else
10525 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010526 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010527 PyUnicode_GET_LENGTH(self), maxcount
10528 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 case PyUnicode_2BYTE_KIND:
10530 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010531 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 PyUnicode_GET_LENGTH(self), maxcount
10533 );
10534 case PyUnicode_4BYTE_KIND:
10535 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010536 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 PyUnicode_GET_LENGTH(self), maxcount
10538 );
10539 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010540 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 }
10542
10543 if (PyUnicode_READY(substring) == -1)
10544 return NULL;
10545
10546 kind1 = PyUnicode_KIND(self);
10547 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 len1 = PyUnicode_GET_LENGTH(self);
10549 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010550 if (kind1 < kind2 || len1 < len2) {
10551 out = PyList_New(1);
10552 if (out == NULL)
10553 return NULL;
10554 Py_INCREF(self);
10555 PyList_SET_ITEM(out, 0, self);
10556 return out;
10557 }
10558 buf1 = PyUnicode_DATA(self);
10559 buf2 = PyUnicode_DATA(substring);
10560 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010561 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010562 if (!buf2)
10563 return NULL;
10564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010566 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010568 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10569 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010570 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010571 else
10572 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010573 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 break;
10575 case PyUnicode_2BYTE_KIND:
10576 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010577 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 break;
10579 case PyUnicode_4BYTE_KIND:
10580 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010581 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 break;
10583 default:
10584 out = NULL;
10585 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010586 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010587 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010588 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 return out;
10590}
10591
10592static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010593anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10594 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010596 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010598 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10599 return asciilib_find(buf1, len1, buf2, len2, offset);
10600 else
10601 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 case PyUnicode_2BYTE_KIND:
10603 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10604 case PyUnicode_4BYTE_KIND:
10605 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10606 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010607 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608}
10609
10610static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010611anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10612 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010614 switch (kind) {
10615 case PyUnicode_1BYTE_KIND:
10616 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10617 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10618 else
10619 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10620 case PyUnicode_2BYTE_KIND:
10621 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10622 case PyUnicode_4BYTE_KIND:
10623 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10624 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010625 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010626}
10627
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010628static void
10629replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10630 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10631{
10632 int kind = PyUnicode_KIND(u);
10633 void *data = PyUnicode_DATA(u);
10634 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10635 if (kind == PyUnicode_1BYTE_KIND) {
10636 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10637 (Py_UCS1 *)data + len,
10638 u1, u2, maxcount);
10639 }
10640 else if (kind == PyUnicode_2BYTE_KIND) {
10641 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10642 (Py_UCS2 *)data + len,
10643 u1, u2, maxcount);
10644 }
10645 else {
10646 assert(kind == PyUnicode_4BYTE_KIND);
10647 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10648 (Py_UCS4 *)data + len,
10649 u1, u2, maxcount);
10650 }
10651}
10652
Alexander Belopolsky40018472011-02-26 01:02:56 +000010653static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654replace(PyObject *self, PyObject *str1,
10655 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010658 const char *sbuf = PyUnicode_DATA(self);
10659 const void *buf1 = PyUnicode_DATA(str1);
10660 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 int srelease = 0, release1 = 0, release2 = 0;
10662 int skind = PyUnicode_KIND(self);
10663 int kind1 = PyUnicode_KIND(str1);
10664 int kind2 = PyUnicode_KIND(str2);
10665 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10666 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10667 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010668 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010669 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010671 if (slen < len1)
10672 goto nothing;
10673
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010675 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010676 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010677 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678
Victor Stinner59de0ee2011-10-07 10:01:28 +020010679 if (str1 == str2)
10680 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681
Victor Stinner49a0a212011-10-12 23:46:10 +020010682 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010683 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10684 if (maxchar < maxchar_str1)
10685 /* substring too wide to be present */
10686 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10688 /* Replacing str1 with str2 may cause a maxchar reduction in the
10689 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010690 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010691 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010696 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010698 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010699 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010700 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010701
Victor Stinner69ed0f42013-04-09 21:48:24 +020010702 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010703 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010704 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010705 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010706 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010708 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010710
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010711 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10712 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010713 }
10714 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 int rkind = skind;
10716 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010717 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (kind1 < rkind) {
10720 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010721 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 if (!buf1) goto error;
10723 release1 = 1;
10724 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010725 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010726 if (i < 0)
10727 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (rkind > kind2) {
10729 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010730 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 if (!buf2) goto error;
10732 release2 = 1;
10733 }
10734 else if (rkind < kind2) {
10735 /* widen self and buf1 */
10736 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010737 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010738 assert(buf1 != PyUnicode_DATA(str1));
10739 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010740 buf1 = PyUnicode_DATA(str1);
10741 release1 = 0;
10742 }
10743 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (!sbuf) goto error;
10745 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010746 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 if (!buf1) goto error;
10748 release1 = 1;
10749 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010750 u = PyUnicode_New(slen, maxchar);
10751 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010753 assert(PyUnicode_KIND(u) == rkind);
10754 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010755
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010756 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010757 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010758 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010760 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010762
10763 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010764 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010765 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010766 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010767 if (i == -1)
10768 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010769 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010771 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010775 }
10776 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010778 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 int rkind = skind;
10780 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010783 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010784 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 if (!buf1) goto error;
10786 release1 = 1;
10787 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010788 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010789 if (n == 0)
10790 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010792 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010793 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 if (!buf2) goto error;
10795 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010798 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010800 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 if (!sbuf) goto error;
10802 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010803 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010804 assert(buf1 != PyUnicode_DATA(str1));
10805 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010806 buf1 = PyUnicode_DATA(str1);
10807 release1 = 0;
10808 }
10809 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 if (!buf1) goto error;
10811 release1 = 1;
10812 }
10813 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10814 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010815 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 PyErr_SetString(PyExc_OverflowError,
10817 "replace string is too long");
10818 goto error;
10819 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010820 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010821 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010822 _Py_INCREF_UNICODE_EMPTY();
10823 if (!unicode_empty)
10824 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010825 u = unicode_empty;
10826 goto done;
10827 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010828 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 PyErr_SetString(PyExc_OverflowError,
10830 "replace string is too long");
10831 goto error;
10832 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010833 u = PyUnicode_New(new_size, maxchar);
10834 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010836 assert(PyUnicode_KIND(u) == rkind);
10837 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 ires = i = 0;
10839 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010840 while (n-- > 0) {
10841 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010842 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010843 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010844 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010845 if (j == -1)
10846 break;
10847 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010848 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010849 memcpy(res + rkind * ires,
10850 sbuf + rkind * i,
10851 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010853 }
10854 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010856 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010858 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010864 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010865 memcpy(res + rkind * ires,
10866 sbuf + rkind * i,
10867 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010868 }
10869 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010870 /* interleave */
10871 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010872 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010874 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010876 if (--n <= 0)
10877 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010878 memcpy(res + rkind * ires,
10879 sbuf + rkind * i,
10880 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 ires++;
10882 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010883 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010884 memcpy(res + rkind * ires,
10885 sbuf + rkind * i,
10886 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010887 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010888 }
10889
10890 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010891 unicode_adjust_maxchar(&u);
10892 if (u == NULL)
10893 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010895
10896 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010897 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10898 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10899 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010901 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010903 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010905 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010906 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010908
Benjamin Peterson29060642009-01-31 22:14:21 +000010909 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010910 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010911 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10912 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10913 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010915 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010917 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010919 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010920 return unicode_result_unchanged(self);
10921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010923 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10924 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10925 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10926 if (srelease)
10927 PyMem_FREE((void *)sbuf);
10928 if (release1)
10929 PyMem_FREE((void *)buf1);
10930 if (release2)
10931 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933}
10934
10935/* --- Unicode Object Methods --------------------------------------------- */
10936
INADA Naoki3ae20562017-01-16 20:41:20 +090010937/*[clinic input]
10938str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939
INADA Naoki3ae20562017-01-16 20:41:20 +090010940Return a version of the string where each word is titlecased.
10941
10942More specifically, words start with uppercased characters and all remaining
10943cased characters have lower case.
10944[clinic start generated code]*/
10945
10946static PyObject *
10947unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010948/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010950 if (PyUnicode_READY(self) == -1)
10951 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010952 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953}
10954
INADA Naoki3ae20562017-01-16 20:41:20 +090010955/*[clinic input]
10956str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957
INADA Naoki3ae20562017-01-16 20:41:20 +090010958Return a capitalized version of the string.
10959
10960More specifically, make the first character have upper case and the rest lower
10961case.
10962[clinic start generated code]*/
10963
10964static PyObject *
10965unicode_capitalize_impl(PyObject *self)
10966/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010968 if (PyUnicode_READY(self) == -1)
10969 return NULL;
10970 if (PyUnicode_GET_LENGTH(self) == 0)
10971 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010972 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973}
10974
INADA Naoki3ae20562017-01-16 20:41:20 +090010975/*[clinic input]
10976str.casefold as unicode_casefold
10977
10978Return a version of the string suitable for caseless comparisons.
10979[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010980
10981static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010982unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010983/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010984{
10985 if (PyUnicode_READY(self) == -1)
10986 return NULL;
10987 if (PyUnicode_IS_ASCII(self))
10988 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010989 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010990}
10991
10992
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010993/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010994
10995static int
10996convert_uc(PyObject *obj, void *addr)
10997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010999
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011000 if (!PyUnicode_Check(obj)) {
11001 PyErr_Format(PyExc_TypeError,
11002 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011003 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011004 return 0;
11005 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011006 if (PyUnicode_READY(obj) < 0)
11007 return 0;
11008 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011009 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011011 return 0;
11012 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011013 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011014 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011015}
11016
INADA Naoki3ae20562017-01-16 20:41:20 +090011017/*[clinic input]
11018str.center as unicode_center
11019
11020 width: Py_ssize_t
11021 fillchar: Py_UCS4 = ' '
11022 /
11023
11024Return a centered string of length width.
11025
11026Padding is done using the specified fill character (default is a space).
11027[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028
11029static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011030unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11031/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011033 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
Benjamin Petersonbac79492012-01-14 13:34:47 -050011035 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 return NULL;
11037
Victor Stinnerc4b49542011-12-11 22:44:26 +010011038 if (PyUnicode_GET_LENGTH(self) >= width)
11039 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040
Victor Stinnerc4b49542011-12-11 22:44:26 +010011041 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042 left = marg / 2 + (marg & width & 1);
11043
Victor Stinner9310abb2011-10-05 00:59:23 +020011044 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045}
11046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047/* This function assumes that str1 and str2 are readied by the caller. */
11048
Marc-André Lemburge5034372000-08-08 08:04:29 +000011049static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011050unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011051{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011052#define COMPARE(TYPE1, TYPE2) \
11053 do { \
11054 TYPE1* p1 = (TYPE1 *)data1; \
11055 TYPE2* p2 = (TYPE2 *)data2; \
11056 TYPE1* end = p1 + len; \
11057 Py_UCS4 c1, c2; \
11058 for (; p1 != end; p1++, p2++) { \
11059 c1 = *p1; \
11060 c2 = *p2; \
11061 if (c1 != c2) \
11062 return (c1 < c2) ? -1 : 1; \
11063 } \
11064 } \
11065 while (0)
11066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011068 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011069 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 kind1 = PyUnicode_KIND(str1);
11072 kind2 = PyUnicode_KIND(str2);
11073 data1 = PyUnicode_DATA(str1);
11074 data2 = PyUnicode_DATA(str2);
11075 len1 = PyUnicode_GET_LENGTH(str1);
11076 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011077 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011078
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011079 switch(kind1) {
11080 case PyUnicode_1BYTE_KIND:
11081 {
11082 switch(kind2) {
11083 case PyUnicode_1BYTE_KIND:
11084 {
11085 int cmp = memcmp(data1, data2, len);
11086 /* normalize result of memcmp() into the range [-1; 1] */
11087 if (cmp < 0)
11088 return -1;
11089 if (cmp > 0)
11090 return 1;
11091 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011092 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011093 case PyUnicode_2BYTE_KIND:
11094 COMPARE(Py_UCS1, Py_UCS2);
11095 break;
11096 case PyUnicode_4BYTE_KIND:
11097 COMPARE(Py_UCS1, Py_UCS4);
11098 break;
11099 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011100 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011101 }
11102 break;
11103 }
11104 case PyUnicode_2BYTE_KIND:
11105 {
11106 switch(kind2) {
11107 case PyUnicode_1BYTE_KIND:
11108 COMPARE(Py_UCS2, Py_UCS1);
11109 break;
11110 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011111 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011112 COMPARE(Py_UCS2, Py_UCS2);
11113 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011114 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011115 case PyUnicode_4BYTE_KIND:
11116 COMPARE(Py_UCS2, Py_UCS4);
11117 break;
11118 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011119 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011120 }
11121 break;
11122 }
11123 case PyUnicode_4BYTE_KIND:
11124 {
11125 switch(kind2) {
11126 case PyUnicode_1BYTE_KIND:
11127 COMPARE(Py_UCS4, Py_UCS1);
11128 break;
11129 case PyUnicode_2BYTE_KIND:
11130 COMPARE(Py_UCS4, Py_UCS2);
11131 break;
11132 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011133 {
11134#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11135 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11136 /* normalize result of wmemcmp() into the range [-1; 1] */
11137 if (cmp < 0)
11138 return -1;
11139 if (cmp > 0)
11140 return 1;
11141#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011142 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011143#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011144 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011145 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011146 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011147 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011148 }
11149 break;
11150 }
11151 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011152 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011153 }
11154
Victor Stinner770e19e2012-10-04 22:59:45 +020011155 if (len1 == len2)
11156 return 0;
11157 if (len1 < len2)
11158 return -1;
11159 else
11160 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011161
11162#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011163}
11164
Benjamin Peterson621b4302016-09-09 13:54:34 -070011165static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011166unicode_compare_eq(PyObject *str1, PyObject *str2)
11167{
11168 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011169 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011170 Py_ssize_t len;
11171 int cmp;
11172
Victor Stinnere5567ad2012-10-23 02:48:49 +020011173 len = PyUnicode_GET_LENGTH(str1);
11174 if (PyUnicode_GET_LENGTH(str2) != len)
11175 return 0;
11176 kind = PyUnicode_KIND(str1);
11177 if (PyUnicode_KIND(str2) != kind)
11178 return 0;
11179 data1 = PyUnicode_DATA(str1);
11180 data2 = PyUnicode_DATA(str2);
11181
11182 cmp = memcmp(data1, data2, len * kind);
11183 return (cmp == 0);
11184}
11185
11186
Alexander Belopolsky40018472011-02-26 01:02:56 +000011187int
11188PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11191 if (PyUnicode_READY(left) == -1 ||
11192 PyUnicode_READY(right) == -1)
11193 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011194
11195 /* a string is equal to itself */
11196 if (left == right)
11197 return 0;
11198
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011199 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011201 PyErr_Format(PyExc_TypeError,
11202 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011203 Py_TYPE(left)->tp_name,
11204 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 return -1;
11206}
11207
Martin v. Löwis5b222132007-06-10 09:51:05 +000011208int
11209PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 Py_ssize_t i;
11212 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011214 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215
Victor Stinner910337b2011-10-03 03:20:16 +020011216 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011217 if (!PyUnicode_IS_READY(uni)) {
11218 const wchar_t *ws = _PyUnicode_WSTR(uni);
11219 /* Compare Unicode string and source character set string */
11220 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11221 if (chr != ustr[i])
11222 return (chr < ustr[i]) ? -1 : 1;
11223 }
11224 /* This check keeps Python strings that end in '\0' from comparing equal
11225 to C strings identical up to that point. */
11226 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11227 return 1; /* uni is longer */
11228 if (ustr[i])
11229 return -1; /* str is longer */
11230 return 0;
11231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011233 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011234 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011235 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011236 size_t len, len2 = strlen(str);
11237 int cmp;
11238
11239 len = Py_MIN(len1, len2);
11240 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011241 if (cmp != 0) {
11242 if (cmp < 0)
11243 return -1;
11244 else
11245 return 1;
11246 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011247 if (len1 > len2)
11248 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011249 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011250 return -1; /* str is longer */
11251 return 0;
11252 }
11253 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011254 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011255 /* Compare Unicode string and source character set string */
11256 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011257 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011258 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11259 /* This check keeps Python strings that end in '\0' from comparing equal
11260 to C strings identical up to that point. */
11261 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11262 return 1; /* uni is longer */
11263 if (str[i])
11264 return -1; /* str is longer */
11265 return 0;
11266 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011267}
11268
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011269static int
11270non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11271{
11272 size_t i, len;
11273 const wchar_t *p;
11274 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11275 if (strlen(str) != len)
11276 return 0;
11277 p = _PyUnicode_WSTR(unicode);
11278 assert(p);
11279 for (i = 0; i < len; i++) {
11280 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011281 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011282 return 0;
11283 }
11284 return 1;
11285}
11286
11287int
11288_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11289{
11290 size_t len;
11291 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011292 assert(str);
11293#ifndef NDEBUG
11294 for (const char *p = str; *p; p++) {
11295 assert((unsigned char)*p < 128);
11296 }
11297#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011298 if (PyUnicode_READY(unicode) == -1) {
11299 /* Memory error or bad data */
11300 PyErr_Clear();
11301 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11302 }
11303 if (!PyUnicode_IS_ASCII(unicode))
11304 return 0;
11305 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11306 return strlen(str) == len &&
11307 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11308}
11309
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011310int
11311_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11312{
11313 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011314
11315 assert(_PyUnicode_CHECK(left));
11316 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011317#ifndef NDEBUG
11318 for (const char *p = right->string; *p; p++) {
11319 assert((unsigned char)*p < 128);
11320 }
11321#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011322
11323 if (PyUnicode_READY(left) == -1) {
11324 /* memory error or bad data */
11325 PyErr_Clear();
11326 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11327 }
11328
11329 if (!PyUnicode_IS_ASCII(left))
11330 return 0;
11331
11332 right_uni = _PyUnicode_FromId(right); /* borrowed */
11333 if (right_uni == NULL) {
11334 /* memory error or bad data */
11335 PyErr_Clear();
11336 return _PyUnicode_EqualToASCIIString(left, right->string);
11337 }
11338
11339 if (left == right_uni)
11340 return 1;
11341
11342 if (PyUnicode_CHECK_INTERNED(left))
11343 return 0;
11344
Victor Stinner607b1022020-05-05 18:50:30 +020011345#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011346 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011347 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011348 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11349 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011350#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011351
11352 return unicode_compare_eq(left, right_uni);
11353}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011354
Alexander Belopolsky40018472011-02-26 01:02:56 +000011355PyObject *
11356PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011357{
11358 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011359
Victor Stinnere5567ad2012-10-23 02:48:49 +020011360 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11361 Py_RETURN_NOTIMPLEMENTED;
11362
11363 if (PyUnicode_READY(left) == -1 ||
11364 PyUnicode_READY(right) == -1)
11365 return NULL;
11366
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011367 if (left == right) {
11368 switch (op) {
11369 case Py_EQ:
11370 case Py_LE:
11371 case Py_GE:
11372 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011373 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011374 case Py_NE:
11375 case Py_LT:
11376 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011377 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011378 default:
11379 PyErr_BadArgument();
11380 return NULL;
11381 }
11382 }
11383 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011384 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011385 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011386 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011387 }
11388 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011389 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011390 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011391 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011392}
11393
Alexander Belopolsky40018472011-02-26 01:02:56 +000011394int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011395_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11396{
11397 return unicode_eq(aa, bb);
11398}
11399
11400int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011401PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011402{
Victor Stinner77282cb2013-04-14 19:22:47 +020011403 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011404 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011406 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011407
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011408 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011409 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011410 "'in <string>' requires string as left operand, not %.100s",
11411 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011412 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011413 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011414 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011415 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011416 if (ensure_unicode(str) < 0)
11417 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011420 kind2 = PyUnicode_KIND(substr);
11421 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011422 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011424 len2 = PyUnicode_GET_LENGTH(substr);
11425 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011426 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011427 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011428 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011429 if (len2 == 1) {
11430 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11431 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011432 return result;
11433 }
11434 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011435 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011436 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011437 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439
Victor Stinner77282cb2013-04-14 19:22:47 +020011440 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 case PyUnicode_1BYTE_KIND:
11442 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11443 break;
11444 case PyUnicode_2BYTE_KIND:
11445 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11446 break;
11447 case PyUnicode_4BYTE_KIND:
11448 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11449 break;
11450 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011451 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011453
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011454 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011455 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011456 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457
Guido van Rossum403d68b2000-03-13 15:55:09 +000011458 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011459}
11460
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461/* Concat to string or Unicode object giving a new Unicode object. */
11462
Alexander Belopolsky40018472011-02-26 01:02:56 +000011463PyObject *
11464PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011466 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011467 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011468 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011470 if (ensure_unicode(left) < 0)
11471 return NULL;
11472
11473 if (!PyUnicode_Check(right)) {
11474 PyErr_Format(PyExc_TypeError,
11475 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011476 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011477 return NULL;
11478 }
11479 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011480 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
11482 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011483 if (left == unicode_empty)
11484 return PyUnicode_FromObject(right);
11485 if (right == unicode_empty)
11486 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011488 left_len = PyUnicode_GET_LENGTH(left);
11489 right_len = PyUnicode_GET_LENGTH(right);
11490 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011491 PyErr_SetString(PyExc_OverflowError,
11492 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011493 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011494 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011495 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011496
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011497 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11498 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011499 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011502 result = PyUnicode_New(new_len, maxchar);
11503 if (result == NULL)
11504 return NULL;
11505 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11506 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11507 assert(_PyUnicode_CheckConsistency(result, 1));
11508 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509}
11510
Walter Dörwald1ab83302007-05-18 17:15:44 +000011511void
Victor Stinner23e56682011-10-03 03:54:37 +020011512PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011513{
Victor Stinner23e56682011-10-03 03:54:37 +020011514 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011515 Py_UCS4 maxchar, maxchar2;
11516 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011517
11518 if (p_left == NULL) {
11519 if (!PyErr_Occurred())
11520 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011521 return;
11522 }
Victor Stinner23e56682011-10-03 03:54:37 +020011523 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011524 if (right == NULL || left == NULL
11525 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011526 if (!PyErr_Occurred())
11527 PyErr_BadInternalCall();
11528 goto error;
11529 }
11530
Benjamin Petersonbac79492012-01-14 13:34:47 -050011531 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011532 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011533 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011534 goto error;
11535
Victor Stinner488fa492011-12-12 00:01:39 +010011536 /* Shortcuts */
11537 if (left == unicode_empty) {
11538 Py_DECREF(left);
11539 Py_INCREF(right);
11540 *p_left = right;
11541 return;
11542 }
11543 if (right == unicode_empty)
11544 return;
11545
11546 left_len = PyUnicode_GET_LENGTH(left);
11547 right_len = PyUnicode_GET_LENGTH(right);
11548 if (left_len > PY_SSIZE_T_MAX - right_len) {
11549 PyErr_SetString(PyExc_OverflowError,
11550 "strings are too large to concat");
11551 goto error;
11552 }
11553 new_len = left_len + right_len;
11554
11555 if (unicode_modifiable(left)
11556 && PyUnicode_CheckExact(right)
11557 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011558 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11559 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011560 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011561 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011562 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11563 {
11564 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011565 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011566 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011567
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011568 /* copy 'right' into the newly allocated area of 'left' */
11569 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011570 }
Victor Stinner488fa492011-12-12 00:01:39 +010011571 else {
11572 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11573 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011574 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011575
Victor Stinner488fa492011-12-12 00:01:39 +010011576 /* Concat the two Unicode strings */
11577 res = PyUnicode_New(new_len, maxchar);
11578 if (res == NULL)
11579 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011580 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11581 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011582 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011583 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011584 }
11585 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011586 return;
11587
11588error:
Victor Stinner488fa492011-12-12 00:01:39 +010011589 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011590}
11591
11592void
11593PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11594{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011595 PyUnicode_Append(pleft, right);
11596 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011597}
11598
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011599/*
11600Wraps stringlib_parse_args_finds() and additionally ensures that the
11601first argument is a unicode object.
11602*/
11603
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011604static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011605parse_args_finds_unicode(const char * function_name, PyObject *args,
11606 PyObject **substring,
11607 Py_ssize_t *start, Py_ssize_t *end)
11608{
11609 if(stringlib_parse_args_finds(function_name, args, substring,
11610 start, end)) {
11611 if (ensure_unicode(*substring) < 0)
11612 return 0;
11613 return 1;
11614 }
11615 return 0;
11616}
11617
/* Help text for S.count().  Presumably attached to the "count" entry of the
   str method table -- confirm where count__doc__ is referenced. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011618PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011621Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011622string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
11625static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011626unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011628 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011629 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011630 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011632 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011633 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011636 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 kind1 = PyUnicode_KIND(self);
11640 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011641 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011642 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 len1 = PyUnicode_GET_LENGTH(self);
11645 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011647 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011648 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011649
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011650 buf1 = PyUnicode_DATA(self);
11651 buf2 = PyUnicode_DATA(substring);
11652 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011653 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011654 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011655 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011656 }
11657 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 case PyUnicode_1BYTE_KIND:
11659 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011660 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 buf2, len2, PY_SSIZE_T_MAX
11662 );
11663 break;
11664 case PyUnicode_2BYTE_KIND:
11665 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011666 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 buf2, len2, PY_SSIZE_T_MAX
11668 );
11669 break;
11670 case PyUnicode_4BYTE_KIND:
11671 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011672 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 buf2, len2, PY_SSIZE_T_MAX
11674 );
11675 break;
11676 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011677 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 }
11679
11680 result = PyLong_FromSsize_t(iresult);
11681
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011682 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011683 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011684 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686 return result;
11687}
11688
INADA Naoki3ae20562017-01-16 20:41:20 +090011689/*[clinic input]
11690str.encode as unicode_encode
11691
11692 encoding: str(c_default="NULL") = 'utf-8'
11693 The encoding in which to encode the string.
11694 errors: str(c_default="NULL") = 'strict'
11695 The error handling scheme to use for encoding errors.
11696 The default is 'strict' meaning that encoding errors raise a
11697 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11698 'xmlcharrefreplace' as well as any other name registered with
11699 codecs.register_error that can handle UnicodeEncodeErrors.
11700
11701Encode the string using the codec registered for encoding.
11702[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
11704static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011705unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011706/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011708 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011709}
11710
INADA Naoki3ae20562017-01-16 20:41:20 +090011711/*[clinic input]
11712str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
INADA Naoki3ae20562017-01-16 20:41:20 +090011714 tabsize: int = 8
11715
11716Return a copy where all tab characters are expanded using spaces.
11717
11718If tabsize is not given, a tab size of 8 characters is assumed.
11719[clinic start generated code]*/
11720
11721static PyObject *
11722unicode_expandtabs_impl(PyObject *self, int tabsize)
11723/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011725 Py_ssize_t i, j, line_pos, src_len, incr;
11726 Py_UCS4 ch;
11727 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011728 const void *src_data;
11729 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011730 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011731 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
Antoine Pitrou22425222011-10-04 19:10:51 +020011733 if (PyUnicode_READY(self) == -1)
11734 return NULL;
11735
Thomas Wouters7e474022000-07-16 12:04:32 +000011736 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011737 src_len = PyUnicode_GET_LENGTH(self);
11738 i = j = line_pos = 0;
11739 kind = PyUnicode_KIND(self);
11740 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011741 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011742 for (; i < src_len; i++) {
11743 ch = PyUnicode_READ(kind, src_data, i);
11744 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011745 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011747 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011748 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011749 goto overflow;
11750 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011752 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011756 goto overflow;
11757 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011759 if (ch == '\n' || ch == '\r')
11760 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011762 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011763 if (!found)
11764 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011765
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011767 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 if (!u)
11769 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011770 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771
Antoine Pitroue71d5742011-10-04 15:55:09 +020011772 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
Antoine Pitroue71d5742011-10-04 15:55:09 +020011774 for (; i < src_len; i++) {
11775 ch = PyUnicode_READ(kind, src_data, i);
11776 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011778 incr = tabsize - (line_pos % tabsize);
11779 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011780 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011781 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011783 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011784 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011785 line_pos++;
11786 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011787 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011788 if (ch == '\n' || ch == '\r')
11789 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011791 }
11792 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011793 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011794
Antoine Pitroue71d5742011-10-04 15:55:09 +020011795 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011796 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798}
11799
/* Help text for S.find().  Presumably attached to the "find" entry of the
   str method table -- confirm where find__doc__ is referenced. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011800PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802\n\
11803Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011804such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805arguments start and end are interpreted as in slice notation.\n\
11806\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011807Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
11809static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011812 /* initialize variables to prevent gcc warning */
11813 PyObject *substring = NULL;
11814 Py_ssize_t start = 0;
11815 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011816 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011818 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011821 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011824 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 if (result == -2)
11827 return NULL;
11828
Christian Heimes217cfd12007-12-02 14:31:20 +000011829 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830}
11831
11832static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011833unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011835 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011836 enum PyUnicode_Kind kind;
11837 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011838
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011839 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011840 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011842 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011843 if (PyUnicode_READY(self) == -1) {
11844 return NULL;
11845 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011846 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11847 PyErr_SetString(PyExc_IndexError, "string index out of range");
11848 return NULL;
11849 }
11850 kind = PyUnicode_KIND(self);
11851 data = PyUnicode_DATA(self);
11852 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011853 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854}
11855
Guido van Rossumc2504932007-09-18 19:42:40 +000011856/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011857 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011858static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011859unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011861 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011862
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011863#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011864 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011865#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 if (_PyUnicode_HASH(self) != -1)
11867 return _PyUnicode_HASH(self);
11868 if (PyUnicode_READY(self) == -1)
11869 return -1;
animalizea1d14252019-01-02 20:16:06 +080011870
Christian Heimes985ecdc2013-11-20 11:46:18 +010011871 x = _Py_HashBytes(PyUnicode_DATA(self),
11872 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011874 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875}
11876
/* Help text for S.index().  Presumably attached to the "index" entry of the
   str method table -- confirm where index__doc__ is referenced. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011877PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879\n\
oldkaa0735f2018-02-02 16:52:55 +080011880Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011881such that sub is contained within S[start:end]. Optional\n\
11882arguments start and end are interpreted as in slice notation.\n\
11883\n\
11884Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
11886static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011889 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011890 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011891 PyObject *substring = NULL;
11892 Py_ssize_t start = 0;
11893 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011895 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011898 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011901 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 if (result == -2)
11904 return NULL;
11905
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906 if (result < 0) {
11907 PyErr_SetString(PyExc_ValueError, "substring not found");
11908 return NULL;
11909 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011910
Christian Heimes217cfd12007-12-02 14:31:20 +000011911 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912}
11913
INADA Naoki3ae20562017-01-16 20:41:20 +090011914/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011915str.isascii as unicode_isascii
11916
11917Return True if all characters in the string are ASCII, False otherwise.
11918
11919ASCII characters have code points in the range U+0000-U+007F.
11920Empty string is ASCII too.
11921[clinic start generated code]*/
11922
11923static PyObject *
11924unicode_isascii_impl(PyObject *self)
11925/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11926{
11927 if (PyUnicode_READY(self) == -1) {
11928 return NULL;
11929 }
11930 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11931}
11932
11933/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011934str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
INADA Naoki3ae20562017-01-16 20:41:20 +090011936Return True if the string is a lowercase string, False otherwise.
11937
11938A string is lowercase if all cased characters in the string are lowercase and
11939there is at least one cased character in the string.
11940[clinic start generated code]*/
11941
11942static PyObject *
11943unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011944/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 Py_ssize_t i, length;
11947 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011948 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 int cased;
11950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 if (PyUnicode_READY(self) == -1)
11952 return NULL;
11953 length = PyUnicode_GET_LENGTH(self);
11954 kind = PyUnicode_KIND(self);
11955 data = PyUnicode_DATA(self);
11956
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 if (length == 1)
11959 return PyBool_FromLong(
11960 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011962 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011964 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011965
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 for (i = 0; i < length; i++) {
11968 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011969
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011971 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 else if (!cased && Py_UNICODE_ISLOWER(ch))
11973 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011975 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976}
11977
INADA Naoki3ae20562017-01-16 20:41:20 +090011978/*[clinic input]
11979str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980
INADA Naoki3ae20562017-01-16 20:41:20 +090011981Return True if the string is an uppercase string, False otherwise.
11982
11983A string is uppercase if all cased characters in the string are uppercase and
11984there is at least one cased character in the string.
11985[clinic start generated code]*/
11986
11987static PyObject *
11988unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011989/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 Py_ssize_t i, length;
11992 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011993 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994 int cased;
11995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (PyUnicode_READY(self) == -1)
11997 return NULL;
11998 length = PyUnicode_GET_LENGTH(self);
11999 kind = PyUnicode_KIND(self);
12000 data = PyUnicode_DATA(self);
12001
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (length == 1)
12004 return PyBool_FromLong(
12005 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012007 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012009 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012010
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 for (i = 0; i < length; i++) {
12013 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012014
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012016 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012017 else if (!cased && Py_UNICODE_ISUPPER(ch))
12018 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012020 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021}
12022
INADA Naoki3ae20562017-01-16 20:41:20 +090012023/*[clinic input]
12024str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025
INADA Naoki3ae20562017-01-16 20:41:20 +090012026Return True if the string is a title-cased string, False otherwise.
12027
12028In a title-cased string, upper- and title-case characters may only
12029follow uncased characters and lowercase characters only cased ones.
12030[clinic start generated code]*/
12031
12032static PyObject *
12033unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012034/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 Py_ssize_t i, length;
12037 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012038 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039 int cased, previous_is_cased;
12040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 if (PyUnicode_READY(self) == -1)
12042 return NULL;
12043 length = PyUnicode_GET_LENGTH(self);
12044 kind = PyUnicode_KIND(self);
12045 data = PyUnicode_DATA(self);
12046
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (length == 1) {
12049 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12050 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12051 (Py_UNICODE_ISUPPER(ch) != 0));
12052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012054 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012056 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012057
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058 cased = 0;
12059 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 for (i = 0; i < length; i++) {
12061 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012062
Benjamin Peterson29060642009-01-31 22:14:21 +000012063 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12064 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012065 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 previous_is_cased = 1;
12067 cased = 1;
12068 }
12069 else if (Py_UNICODE_ISLOWER(ch)) {
12070 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012071 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 previous_is_cased = 1;
12073 cased = 1;
12074 }
12075 else
12076 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012078 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079}
12080
INADA Naoki3ae20562017-01-16 20:41:20 +090012081/*[clinic input]
12082str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083
INADA Naoki3ae20562017-01-16 20:41:20 +090012084Return True if the string is a whitespace string, False otherwise.
12085
12086A string is whitespace if all characters in the string are whitespace and there
12087is at least one character in the string.
12088[clinic start generated code]*/
12089
12090static PyObject *
12091unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012092/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 Py_ssize_t i, length;
12095 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012096 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097
12098 if (PyUnicode_READY(self) == -1)
12099 return NULL;
12100 length = PyUnicode_GET_LENGTH(self);
12101 kind = PyUnicode_KIND(self);
12102 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (length == 1)
12106 return PyBool_FromLong(
12107 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012109 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012111 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 for (i = 0; i < length; i++) {
12114 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012115 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012116 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012118 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119}
12120
INADA Naoki3ae20562017-01-16 20:41:20 +090012121/*[clinic input]
12122str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012123
INADA Naoki3ae20562017-01-16 20:41:20 +090012124Return True if the string is an alphabetic string, False otherwise.
12125
12126A string is alphabetic if all characters in the string are alphabetic and there
12127is at least one character in the string.
12128[clinic start generated code]*/
12129
12130static PyObject *
12131unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012132/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 Py_ssize_t i, length;
12135 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012136 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137
12138 if (PyUnicode_READY(self) == -1)
12139 return NULL;
12140 length = PyUnicode_GET_LENGTH(self);
12141 kind = PyUnicode_KIND(self);
12142 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012143
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012144 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 if (length == 1)
12146 return PyBool_FromLong(
12147 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012148
12149 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012151 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 for (i = 0; i < length; i++) {
12154 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012155 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012156 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012157 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012158}
12159
INADA Naoki3ae20562017-01-16 20:41:20 +090012160/*[clinic input]
12161str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012162
INADA Naoki3ae20562017-01-16 20:41:20 +090012163Return True if the string is an alpha-numeric string, False otherwise.
12164
12165A string is alpha-numeric if all characters in the string are alpha-numeric and
12166there is at least one character in the string.
12167[clinic start generated code]*/
12168
12169static PyObject *
12170unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012171/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012174 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 Py_ssize_t len, i;
12176
12177 if (PyUnicode_READY(self) == -1)
12178 return NULL;
12179
12180 kind = PyUnicode_KIND(self);
12181 data = PyUnicode_DATA(self);
12182 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012183
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012184 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 if (len == 1) {
12186 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12187 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12188 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012189
12190 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012192 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 for (i = 0; i < len; i++) {
12195 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012196 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012197 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012198 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012199 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012200}
12201
INADA Naoki3ae20562017-01-16 20:41:20 +090012202/*[clinic input]
12203str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204
INADA Naoki3ae20562017-01-16 20:41:20 +090012205Return True if the string is a decimal string, False otherwise.
12206
12207A string is a decimal string if all characters in the string are decimal and
12208there is at least one character in the string.
12209[clinic start generated code]*/
12210
12211static PyObject *
12212unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012213/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 Py_ssize_t i, length;
12216 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012217 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218
12219 if (PyUnicode_READY(self) == -1)
12220 return NULL;
12221 length = PyUnicode_GET_LENGTH(self);
12222 kind = PyUnicode_KIND(self);
12223 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 if (length == 1)
12227 return PyBool_FromLong(
12228 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012230 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012232 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 for (i = 0; i < length; i++) {
12235 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012236 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012238 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239}
12240
INADA Naoki3ae20562017-01-16 20:41:20 +090012241/*[clinic input]
12242str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243
INADA Naoki3ae20562017-01-16 20:41:20 +090012244Return True if the string is a digit string, False otherwise.
12245
12246A string is a digit string if all characters in the string are digits and there
12247is at least one character in the string.
12248[clinic start generated code]*/
12249
12250static PyObject *
12251unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012252/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 Py_ssize_t i, length;
12255 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012256 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257
12258 if (PyUnicode_READY(self) == -1)
12259 return NULL;
12260 length = PyUnicode_GET_LENGTH(self);
12261 kind = PyUnicode_KIND(self);
12262 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 if (length == 1) {
12266 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12267 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012270 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012272 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 for (i = 0; i < length; i++) {
12275 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012276 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012278 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279}
12280
INADA Naoki3ae20562017-01-16 20:41:20 +090012281/*[clinic input]
12282str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283
INADA Naoki3ae20562017-01-16 20:41:20 +090012284Return True if the string is a numeric string, False otherwise.
12285
12286A string is numeric if all characters in the string are numeric and there is at
12287least one character in the string.
12288[clinic start generated code]*/
12289
12290static PyObject *
12291unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012292/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 Py_ssize_t i, length;
12295 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012296 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297
12298 if (PyUnicode_READY(self) == -1)
12299 return NULL;
12300 length = PyUnicode_GET_LENGTH(self);
12301 kind = PyUnicode_KIND(self);
12302 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 if (length == 1)
12306 return PyBool_FromLong(
12307 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012309 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012311 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 for (i = 0; i < length; i++) {
12314 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012315 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012317 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318}
12319
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012320Py_ssize_t
12321_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012324 if (PyUnicode_READY(self) == -1)
12325 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012326
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012327 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012328 if (len == 0) {
12329 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 }
12332
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012333 int kind = PyUnicode_KIND(self);
12334 const void *data = PyUnicode_DATA(self);
12335 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012336 /* PEP 3131 says that the first character must be in
12337 XID_Start and subsequent characters in XID_Continue,
12338 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012339 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012340 letters, digits, underscore). However, given the current
12341 definition of XID_Start and XID_Continue, it is sufficient
12342 to check just for these, except that _ must be allowed
12343 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012344 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012345 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012346 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012347
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012348 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012349 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012350 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012351 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012352 }
12353 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012354 return i;
12355}
12356
12357int
12358PyUnicode_IsIdentifier(PyObject *self)
12359{
12360 if (PyUnicode_IS_READY(self)) {
12361 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12362 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12363 /* an empty string is not a valid identifier */
12364 return len && i == len;
12365 }
12366 else {
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012367 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012368 if (len == 0) {
12369 /* an empty string is not a valid identifier */
12370 return 0;
12371 }
12372
12373 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012374 Py_UCS4 ch = wstr[i++];
12375#if SIZEOF_WCHAR_T == 2
12376 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12377 && i < len
12378 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12379 {
12380 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12381 i++;
12382 }
12383#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012384 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12385 return 0;
12386 }
12387
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012388 while (i < len) {
12389 ch = wstr[i++];
12390#if SIZEOF_WCHAR_T == 2
12391 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12392 && i < len
12393 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12394 {
12395 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12396 i++;
12397 }
12398#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012399 if (!_PyUnicode_IsXidContinue(ch)) {
12400 return 0;
12401 }
12402 }
12403 return 1;
12404 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012405}
12406
INADA Naoki3ae20562017-01-16 20:41:20 +090012407/*[clinic input]
12408str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012409
INADA Naoki3ae20562017-01-16 20:41:20 +090012410Return True if the string is a valid Python identifier, False otherwise.
12411
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012412Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012413such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012414[clinic start generated code]*/
12415
12416static PyObject *
12417unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012418/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012419{
12420 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12421}
12422
INADA Naoki3ae20562017-01-16 20:41:20 +090012423/*[clinic input]
12424str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012425
INADA Naoki3ae20562017-01-16 20:41:20 +090012426Return True if the string is printable, False otherwise.
12427
12428A string is printable if all of its characters are considered printable in
12429repr() or if it is empty.
12430[clinic start generated code]*/
12431
12432static PyObject *
12433unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012434/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 Py_ssize_t i, length;
12437 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012438 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439
12440 if (PyUnicode_READY(self) == -1)
12441 return NULL;
12442 length = PyUnicode_GET_LENGTH(self);
12443 kind = PyUnicode_KIND(self);
12444 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012445
12446 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 if (length == 1)
12448 return PyBool_FromLong(
12449 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 for (i = 0; i < length; i++) {
12452 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012453 Py_RETURN_FALSE;
12454 }
12455 }
12456 Py_RETURN_TRUE;
12457}
12458
INADA Naoki3ae20562017-01-16 20:41:20 +090012459/*[clinic input]
12460str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461
INADA Naoki3ae20562017-01-16 20:41:20 +090012462 iterable: object
12463 /
12464
12465Concatenate any number of strings.
12466
Martin Panter91a88662017-01-24 00:30:06 +000012467The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012468The result is returned as a new string.
12469
12470Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12471[clinic start generated code]*/
12472
12473static PyObject *
12474unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012475/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476{
INADA Naoki3ae20562017-01-16 20:41:20 +090012477 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478}
12479
Martin v. Löwis18e16552006-02-15 17:27:45 +000012480static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012481unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 if (PyUnicode_READY(self) == -1)
12484 return -1;
12485 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486}
12487
INADA Naoki3ae20562017-01-16 20:41:20 +090012488/*[clinic input]
12489str.ljust as unicode_ljust
12490
12491 width: Py_ssize_t
12492 fillchar: Py_UCS4 = ' '
12493 /
12494
12495Return a left-justified string of length width.
12496
12497Padding is done using the specified fill character (default is a space).
12498[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499
12500static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012501unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12502/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012504 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506
Victor Stinnerc4b49542011-12-11 22:44:26 +010012507 if (PyUnicode_GET_LENGTH(self) >= width)
12508 return unicode_result_unchanged(self);
12509
12510 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511}
12512
INADA Naoki3ae20562017-01-16 20:41:20 +090012513/*[clinic input]
12514str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
INADA Naoki3ae20562017-01-16 20:41:20 +090012516Return a copy of the string converted to lowercase.
12517[clinic start generated code]*/
12518
12519static PyObject *
12520unicode_lower_impl(PyObject *self)
12521/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012523 if (PyUnicode_READY(self) == -1)
12524 return NULL;
12525 if (PyUnicode_IS_ASCII(self))
12526 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012527 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528}
12529
/* Strip modes understood by the strip helpers below. The values double as
   indices into stripfuncnames[], so the two must be kept in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above; used when building error
   messages. Both the strings and the array of pointers are const because
   the table is never modified. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip mode (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP) to its method name.
   The index is not range-checked. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012538
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012539/* externally visible for str.strip(unicode) */
12540PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012541_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012542{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012543 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 int kind;
12545 Py_ssize_t i, j, len;
12546 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012547 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12550 return NULL;
12551
12552 kind = PyUnicode_KIND(self);
12553 data = PyUnicode_DATA(self);
12554 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012555 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12557 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012558 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012559
Benjamin Peterson14339b62009-01-31 16:36:08 +000012560 i = 0;
12561 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012562 while (i < len) {
12563 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12564 if (!BLOOM(sepmask, ch))
12565 break;
12566 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12567 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012568 i++;
12569 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012570 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012571
Benjamin Peterson14339b62009-01-31 16:36:08 +000012572 j = len;
12573 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012574 j--;
12575 while (j >= i) {
12576 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12577 if (!BLOOM(sepmask, ch))
12578 break;
12579 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12580 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012581 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012582 }
12583
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012585 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012586
Victor Stinner7931d9a2011-11-04 00:22:48 +010012587 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588}
12589
12590PyObject*
12591PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12592{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012593 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012595 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596
Victor Stinnerde636f32011-10-01 03:55:54 +020012597 if (PyUnicode_READY(self) == -1)
12598 return NULL;
12599
Victor Stinner684d5fd2012-05-03 02:32:34 +020012600 length = PyUnicode_GET_LENGTH(self);
12601 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012602
Victor Stinner684d5fd2012-05-03 02:32:34 +020012603 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012604 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605
Victor Stinnerde636f32011-10-01 03:55:54 +020012606 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012607 PyErr_SetString(PyExc_IndexError, "string index out of range");
12608 return NULL;
12609 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012610 if (start >= length || end < start)
12611 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012612
Victor Stinner684d5fd2012-05-03 02:32:34 +020012613 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012614 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012615 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012616 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012617 }
12618 else {
12619 kind = PyUnicode_KIND(self);
12620 data = PyUnicode_1BYTE_DATA(self);
12621 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012622 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012623 length);
12624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
12627static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012628do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 Py_ssize_t len, i, j;
12631
12632 if (PyUnicode_READY(self) == -1)
12633 return NULL;
12634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012636
Victor Stinnercc7af722013-04-09 22:39:24 +020012637 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012638 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012639
12640 i = 0;
12641 if (striptype != RIGHTSTRIP) {
12642 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012643 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012644 if (!_Py_ascii_whitespace[ch])
12645 break;
12646 i++;
12647 }
12648 }
12649
12650 j = len;
12651 if (striptype != LEFTSTRIP) {
12652 j--;
12653 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012654 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012655 if (!_Py_ascii_whitespace[ch])
12656 break;
12657 j--;
12658 }
12659 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012660 }
12661 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012662 else {
12663 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012664 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012665
Victor Stinnercc7af722013-04-09 22:39:24 +020012666 i = 0;
12667 if (striptype != RIGHTSTRIP) {
12668 while (i < len) {
12669 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12670 if (!Py_UNICODE_ISSPACE(ch))
12671 break;
12672 i++;
12673 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012674 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012675
12676 j = len;
12677 if (striptype != LEFTSTRIP) {
12678 j--;
12679 while (j >= i) {
12680 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12681 if (!Py_UNICODE_ISSPACE(ch))
12682 break;
12683 j--;
12684 }
12685 j++;
12686 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012687 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012688
Victor Stinner7931d9a2011-11-04 00:22:48 +010012689 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690}
12691
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012692
12693static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012694do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012695{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012696 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012697 if (PyUnicode_Check(sep))
12698 return _PyUnicode_XStrip(self, striptype, sep);
12699 else {
12700 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012701 "%s arg must be None or str",
12702 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 return NULL;
12704 }
12705 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012706
Benjamin Peterson14339b62009-01-31 16:36:08 +000012707 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012708}
12709
12710
INADA Naoki3ae20562017-01-16 20:41:20 +090012711/*[clinic input]
12712str.strip as unicode_strip
12713
12714 chars: object = None
12715 /
12716
Zachary Ware09895c22019-10-09 16:09:00 -050012717Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012718
12719If chars is given and not None, remove characters in chars instead.
12720[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012721
12722static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012723unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012724/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012725{
INADA Naoki3ae20562017-01-16 20:41:20 +090012726 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012727}
12728
12729
INADA Naoki3ae20562017-01-16 20:41:20 +090012730/*[clinic input]
12731str.lstrip as unicode_lstrip
12732
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012733 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012734 /
12735
12736Return a copy of the string with leading whitespace removed.
12737
12738If chars is given and not None, remove characters in chars instead.
12739[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012740
12741static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012742unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012743/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012744{
INADA Naoki3ae20562017-01-16 20:41:20 +090012745 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012746}
12747
12748
INADA Naoki3ae20562017-01-16 20:41:20 +090012749/*[clinic input]
12750str.rstrip as unicode_rstrip
12751
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012752 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012753 /
12754
12755Return a copy of the string with trailing whitespace removed.
12756
12757If chars is given and not None, remove characters in chars instead.
12758[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012759
12760static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012761unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012762/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012763{
INADA Naoki3ae20562017-01-16 20:41:20 +090012764 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012765}
12766
12767
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012769unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012771 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012773
Serhiy Storchaka05997252013-01-26 12:14:02 +020012774 if (len < 1)
12775 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776
Victor Stinnerc4b49542011-12-11 22:44:26 +010012777 /* no repeat, return original string */
12778 if (len == 1)
12779 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012780
Benjamin Petersonbac79492012-01-14 13:34:47 -050012781 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 return NULL;
12783
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012784 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012785 PyErr_SetString(PyExc_OverflowError,
12786 "repeated string is too long");
12787 return NULL;
12788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012790
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012791 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792 if (!u)
12793 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012794 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012797 int kind = PyUnicode_KIND(str);
12798 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012799 if (kind == PyUnicode_1BYTE_KIND) {
12800 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012801 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012802 }
12803 else if (kind == PyUnicode_2BYTE_KIND) {
12804 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012805 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012806 ucs2[n] = fill_char;
12807 } else {
12808 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12809 assert(kind == PyUnicode_4BYTE_KIND);
12810 for (n = 0; n < len; ++n)
12811 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813 }
12814 else {
12815 /* number of characters copied this far */
12816 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012817 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012819 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012823 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826 }
12827
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012828 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012829 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830}
12831
Alexander Belopolsky40018472011-02-26 01:02:56 +000012832PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012833PyUnicode_Replace(PyObject *str,
12834 PyObject *substr,
12835 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012836 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012838 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12839 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012841 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842}
12843
INADA Naoki3ae20562017-01-16 20:41:20 +090012844/*[clinic input]
12845str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846
INADA Naoki3ae20562017-01-16 20:41:20 +090012847 old: unicode
12848 new: unicode
12849 count: Py_ssize_t = -1
12850 Maximum number of occurrences to replace.
12851 -1 (the default value) means replace all occurrences.
12852 /
12853
12854Return a copy with all occurrences of substring old replaced by new.
12855
12856If the optional argument count is given, only the first count occurrences are
12857replaced.
12858[clinic start generated code]*/
12859
12860static PyObject *
12861unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12862 Py_ssize_t count)
12863/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012865 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012867 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868}
12869
sweeneydea81849b2020-04-22 17:05:48 -040012870/*[clinic input]
12871str.removeprefix as unicode_removeprefix
12872
12873 prefix: unicode
12874 /
12875
12876Return a str with the given prefix string removed if present.
12877
12878If the string starts with the prefix string, return string[len(prefix):].
12879Otherwise, return a copy of the original string.
12880[clinic start generated code]*/
12881
12882static PyObject *
12883unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12884/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12885{
12886 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12887 if (match == -1) {
12888 return NULL;
12889 }
12890 if (match) {
12891 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12892 PyUnicode_GET_LENGTH(self));
12893 }
12894 return unicode_result_unchanged(self);
12895}
12896
12897/*[clinic input]
12898str.removesuffix as unicode_removesuffix
12899
12900 suffix: unicode
12901 /
12902
12903Return a str with the given suffix string removed if present.
12904
12905If the string ends with the suffix string and that suffix is not empty,
12906return string[:-len(suffix)]. Otherwise, return a copy of the original
12907string.
12908[clinic start generated code]*/
12909
12910static PyObject *
12911unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12912/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12913{
12914 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12915 if (match == -1) {
12916 return NULL;
12917 }
12918 if (match) {
12919 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12920 - PyUnicode_GET_LENGTH(suffix));
12921 }
12922 return unicode_result_unchanged(self);
12923}
12924
Alexander Belopolsky40018472011-02-26 01:02:56 +000012925static PyObject *
12926unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012928 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 Py_ssize_t isize;
12930 Py_ssize_t osize, squote, dquote, i, o;
12931 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012932 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012933 const void *idata;
12934 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012937 return NULL;
12938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 isize = PyUnicode_GET_LENGTH(unicode);
12940 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 /* Compute length of output, quote characters, and
12943 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012944 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945 max = 127;
12946 squote = dquote = 0;
12947 ikind = PyUnicode_KIND(unicode);
12948 for (i = 0; i < isize; i++) {
12949 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012950 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012952 case '\'': squote++; break;
12953 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012955 incr = 2;
12956 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 default:
12958 /* Fast-path ASCII */
12959 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012960 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012962 ;
12963 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012966 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012968 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012970 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012972 if (osize > PY_SSIZE_T_MAX - incr) {
12973 PyErr_SetString(PyExc_OverflowError,
12974 "string is too long to generate repr");
12975 return NULL;
12976 }
12977 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 }
12979
12980 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012981 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012983 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 if (dquote)
12985 /* Both squote and dquote present. Use squote,
12986 and escape them */
12987 osize += squote;
12988 else
12989 quote = '"';
12990 }
Victor Stinner55c08782013-04-14 18:45:39 +020012991 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992
12993 repr = PyUnicode_New(osize, max);
12994 if (repr == NULL)
12995 return NULL;
12996 okind = PyUnicode_KIND(repr);
12997 odata = PyUnicode_DATA(repr);
12998
12999 PyUnicode_WRITE(okind, odata, 0, quote);
13000 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013001 if (unchanged) {
13002 _PyUnicode_FastCopyCharacters(repr, 1,
13003 unicode, 0,
13004 isize);
13005 }
13006 else {
13007 for (i = 0, o = 1; i < isize; i++) {
13008 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009
Victor Stinner55c08782013-04-14 18:45:39 +020013010 /* Escape quotes and backslashes */
13011 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013012 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013014 continue;
13015 }
13016
13017 /* Map special whitespace to '\t', \n', '\r' */
13018 if (ch == '\t') {
13019 PyUnicode_WRITE(okind, odata, o++, '\\');
13020 PyUnicode_WRITE(okind, odata, o++, 't');
13021 }
13022 else if (ch == '\n') {
13023 PyUnicode_WRITE(okind, odata, o++, '\\');
13024 PyUnicode_WRITE(okind, odata, o++, 'n');
13025 }
13026 else if (ch == '\r') {
13027 PyUnicode_WRITE(okind, odata, o++, '\\');
13028 PyUnicode_WRITE(okind, odata, o++, 'r');
13029 }
13030
13031 /* Map non-printable US ASCII to '\xhh' */
13032 else if (ch < ' ' || ch == 0x7F) {
13033 PyUnicode_WRITE(okind, odata, o++, '\\');
13034 PyUnicode_WRITE(okind, odata, o++, 'x');
13035 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13036 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13037 }
13038
13039 /* Copy ASCII characters as-is */
13040 else if (ch < 0x7F) {
13041 PyUnicode_WRITE(okind, odata, o++, ch);
13042 }
13043
13044 /* Non-ASCII characters */
13045 else {
13046 /* Map Unicode whitespace and control characters
13047 (categories Z* and C* except ASCII space)
13048 */
13049 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13050 PyUnicode_WRITE(okind, odata, o++, '\\');
13051 /* Map 8-bit characters to '\xhh' */
13052 if (ch <= 0xff) {
13053 PyUnicode_WRITE(okind, odata, o++, 'x');
13054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13056 }
13057 /* Map 16-bit characters to '\uxxxx' */
13058 else if (ch <= 0xffff) {
13059 PyUnicode_WRITE(okind, odata, o++, 'u');
13060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13064 }
13065 /* Map 21-bit characters to '\U00xxxxxx' */
13066 else {
13067 PyUnicode_WRITE(okind, odata, o++, 'U');
13068 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13070 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13071 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13072 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13073 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13074 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13075 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13076 }
13077 }
13078 /* Copy characters as-is */
13079 else {
13080 PyUnicode_WRITE(okind, odata, o++, ch);
13081 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013082 }
13083 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013086 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013087 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088}
13089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013090PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092\n\
13093Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013094such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095arguments start and end are interpreted as in slice notation.\n\
13096\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013097Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098
13099static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013100unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013102 /* initialize variables to prevent gcc warning */
13103 PyObject *substring = NULL;
13104 Py_ssize_t start = 0;
13105 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013106 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013108 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013111 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013114 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013116 if (result == -2)
13117 return NULL;
13118
Christian Heimes217cfd12007-12-02 14:31:20 +000013119 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120}
13121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013122PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013125Return the highest index in S where substring sub is found,\n\
13126such that sub is contained within S[start:end]. Optional\n\
13127arguments start and end are interpreted as in slice notation.\n\
13128\n\
13129Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130
13131static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013134 /* initialize variables to prevent gcc warning */
13135 PyObject *substring = NULL;
13136 Py_ssize_t start = 0;
13137 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013138 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013140 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013143 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013144 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013146 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013148 if (result == -2)
13149 return NULL;
13150
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151 if (result < 0) {
13152 PyErr_SetString(PyExc_ValueError, "substring not found");
13153 return NULL;
13154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013155
Christian Heimes217cfd12007-12-02 14:31:20 +000013156 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157}
13158
INADA Naoki3ae20562017-01-16 20:41:20 +090013159/*[clinic input]
13160str.rjust as unicode_rjust
13161
13162 width: Py_ssize_t
13163 fillchar: Py_UCS4 = ' '
13164 /
13165
13166Return a right-justified string of length width.
13167
13168Padding is done using the specified fill character (default is a space).
13169[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170
13171static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013172unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13173/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013175 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176 return NULL;
13177
Victor Stinnerc4b49542011-12-11 22:44:26 +010013178 if (PyUnicode_GET_LENGTH(self) >= width)
13179 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180
Victor Stinnerc4b49542011-12-11 22:44:26 +010013181 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182}
13183
Alexander Belopolsky40018472011-02-26 01:02:56 +000013184PyObject *
13185PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013187 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013190 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191}
13192
INADA Naoki3ae20562017-01-16 20:41:20 +090013193/*[clinic input]
13194str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013195
INADA Naoki3ae20562017-01-16 20:41:20 +090013196 sep: object = None
13197 The delimiter according which to split the string.
13198 None (the default value) means split according to any whitespace,
13199 and discard empty strings from the result.
13200 maxsplit: Py_ssize_t = -1
13201 Maximum number of splits to do.
13202 -1 (the default value) means no limit.
13203
13204Return a list of the words in the string, using sep as the delimiter string.
13205[clinic start generated code]*/
13206
13207static PyObject *
13208unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13209/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210{
INADA Naoki3ae20562017-01-16 20:41:20 +090013211 if (sep == Py_None)
13212 return split(self, NULL, maxsplit);
13213 if (PyUnicode_Check(sep))
13214 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013215
Victor Stinner998b8062018-09-12 00:23:25 +020013216 PyErr_Format(PyExc_TypeError,
13217 "must be str or None, not %.100s",
13218 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013219 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013220}
13221
Thomas Wouters477c8d52006-05-27 19:21:47 +000013222PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013223PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013224{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013225 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013226 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013227 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013229
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013230 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013231 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013232
Victor Stinner14f8f022011-10-05 20:58:25 +020013233 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013235 len1 = PyUnicode_GET_LENGTH(str_obj);
13236 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013237 if (kind1 < kind2 || len1 < len2) {
13238 _Py_INCREF_UNICODE_EMPTY();
13239 if (!unicode_empty)
13240 out = NULL;
13241 else {
13242 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13243 Py_DECREF(unicode_empty);
13244 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013245 return out;
13246 }
13247 buf1 = PyUnicode_DATA(str_obj);
13248 buf2 = PyUnicode_DATA(sep_obj);
13249 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013250 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013251 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013252 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013255 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013256 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013257 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13258 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13259 else
13260 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013261 break;
13262 case PyUnicode_2BYTE_KIND:
13263 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13264 break;
13265 case PyUnicode_4BYTE_KIND:
13266 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13267 break;
13268 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013269 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013271
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013272 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013273 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013274 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013275
13276 return out;
13277}
13278
13279
13280PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013281PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013282{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013283 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013284 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013285 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013287
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013288 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013289 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013290
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013291 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 len1 = PyUnicode_GET_LENGTH(str_obj);
13294 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013295 if (kind1 < kind2 || len1 < len2) {
13296 _Py_INCREF_UNICODE_EMPTY();
13297 if (!unicode_empty)
13298 out = NULL;
13299 else {
13300 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13301 Py_DECREF(unicode_empty);
13302 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013303 return out;
13304 }
13305 buf1 = PyUnicode_DATA(str_obj);
13306 buf2 = PyUnicode_DATA(sep_obj);
13307 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013308 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013309 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013310 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013313 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013315 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13316 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13317 else
13318 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013319 break;
13320 case PyUnicode_2BYTE_KIND:
13321 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13322 break;
13323 case PyUnicode_4BYTE_KIND:
13324 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13325 break;
13326 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013327 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013329
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013330 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013331 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013332 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013333
13334 return out;
13335}
13336
INADA Naoki3ae20562017-01-16 20:41:20 +090013337/*[clinic input]
13338str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013339
INADA Naoki3ae20562017-01-16 20:41:20 +090013340 sep: object
13341 /
13342
13343Partition the string into three parts using the given separator.
13344
13345This will search for the separator in the string. If the separator is found,
13346returns a 3-tuple containing the part before the separator, the separator
13347itself, and the part after it.
13348
13349If the separator is not found, returns a 3-tuple containing the original string
13350and two empty strings.
13351[clinic start generated code]*/
13352
13353static PyObject *
13354unicode_partition(PyObject *self, PyObject *sep)
13355/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013356{
INADA Naoki3ae20562017-01-16 20:41:20 +090013357 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013358}
13359
INADA Naoki3ae20562017-01-16 20:41:20 +090013360/*[clinic input]
13361str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013362
INADA Naoki3ae20562017-01-16 20:41:20 +090013363Partition the string into three parts using the given separator.
13364
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013365This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013366the separator is found, returns a 3-tuple containing the part before the
13367separator, the separator itself, and the part after it.
13368
13369If the separator is not found, returns a 3-tuple containing two empty strings
13370and the original string.
13371[clinic start generated code]*/
13372
13373static PyObject *
13374unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013375/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013376{
INADA Naoki3ae20562017-01-16 20:41:20 +090013377 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013378}
13379
Alexander Belopolsky40018472011-02-26 01:02:56 +000013380PyObject *
13381PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013382{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013383 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013384 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013385
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013386 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013387}
13388
INADA Naoki3ae20562017-01-16 20:41:20 +090013389/*[clinic input]
13390str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013391
INADA Naoki3ae20562017-01-16 20:41:20 +090013392Return a list of the words in the string, using sep as the delimiter string.
13393
13394Splits are done starting at the end of the string and working to the front.
13395[clinic start generated code]*/
13396
13397static PyObject *
13398unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13399/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013400{
INADA Naoki3ae20562017-01-16 20:41:20 +090013401 if (sep == Py_None)
13402 return rsplit(self, NULL, maxsplit);
13403 if (PyUnicode_Check(sep))
13404 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013405
Victor Stinner998b8062018-09-12 00:23:25 +020013406 PyErr_Format(PyExc_TypeError,
13407 "must be str or None, not %.100s",
13408 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013409 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013410}
13411
INADA Naoki3ae20562017-01-16 20:41:20 +090013412/*[clinic input]
13413str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013414
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013415 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013416
13417Return a list of the lines in the string, breaking at line boundaries.
13418
13419Line breaks are not included in the resulting list unless keepends is given and
13420true.
13421[clinic start generated code]*/
13422
13423static PyObject *
13424unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013425/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013427 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428}
13429
13430static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013431PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013433 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434}
13435
INADA Naoki3ae20562017-01-16 20:41:20 +090013436/*[clinic input]
13437str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438
INADA Naoki3ae20562017-01-16 20:41:20 +090013439Convert uppercase characters to lowercase and lowercase characters to uppercase.
13440[clinic start generated code]*/
13441
13442static PyObject *
13443unicode_swapcase_impl(PyObject *self)
13444/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013446 if (PyUnicode_READY(self) == -1)
13447 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013448 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449}
13450
Larry Hastings61272b72014-01-07 12:41:53 -080013451/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013452
Larry Hastings31826802013-10-19 00:09:25 -070013453@staticmethod
13454str.maketrans as unicode_maketrans
13455
13456 x: object
13457
13458 y: unicode=NULL
13459
13460 z: unicode=NULL
13461
13462 /
13463
13464Return a translation table usable for str.translate().
13465
13466If there is only one argument, it must be a dictionary mapping Unicode
13467ordinals (integers) or characters to Unicode ordinals, strings or None.
13468Character keys will be then converted to ordinals.
13469If there are two arguments, they must be strings of equal length, and
13470in the resulting dictionary, each character in x will be mapped to the
13471character at the same position in y. If there is a third argument, it
13472must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013473[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013474
Larry Hastings31826802013-10-19 00:09:25 -070013475static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013476unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013477/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013478{
Georg Brandlceee0772007-11-27 23:48:05 +000013479 PyObject *new = NULL, *key, *value;
13480 Py_ssize_t i = 0;
13481 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013482
Georg Brandlceee0772007-11-27 23:48:05 +000013483 new = PyDict_New();
13484 if (!new)
13485 return NULL;
13486 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013487 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013488 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489
Georg Brandlceee0772007-11-27 23:48:05 +000013490 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013491 if (!PyUnicode_Check(x)) {
13492 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13493 "be a string if there is a second argument");
13494 goto err;
13495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013496 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013497 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13498 "arguments must have equal length");
13499 goto err;
13500 }
13501 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013502 x_kind = PyUnicode_KIND(x);
13503 y_kind = PyUnicode_KIND(y);
13504 x_data = PyUnicode_DATA(x);
13505 y_data = PyUnicode_DATA(y);
13506 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13507 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013508 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013509 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013510 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013511 if (!value) {
13512 Py_DECREF(key);
13513 goto err;
13514 }
Georg Brandlceee0772007-11-27 23:48:05 +000013515 res = PyDict_SetItem(new, key, value);
13516 Py_DECREF(key);
13517 Py_DECREF(value);
13518 if (res < 0)
13519 goto err;
13520 }
13521 /* create entries for deleting chars in z */
13522 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013523 z_kind = PyUnicode_KIND(z);
13524 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013525 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013526 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013527 if (!key)
13528 goto err;
13529 res = PyDict_SetItem(new, key, Py_None);
13530 Py_DECREF(key);
13531 if (res < 0)
13532 goto err;
13533 }
13534 }
13535 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013536 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013537 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013538
Georg Brandlceee0772007-11-27 23:48:05 +000013539 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013540 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013541 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13542 "to maketrans it must be a dict");
13543 goto err;
13544 }
13545 /* copy entries into the new dict, converting string keys to int keys */
13546 while (PyDict_Next(x, &i, &key, &value)) {
13547 if (PyUnicode_Check(key)) {
13548 /* convert string keys to integer keys */
13549 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013550 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013551 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13552 "table must be of length 1");
13553 goto err;
13554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 kind = PyUnicode_KIND(key);
13556 data = PyUnicode_DATA(key);
13557 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013558 if (!newkey)
13559 goto err;
13560 res = PyDict_SetItem(new, newkey, value);
13561 Py_DECREF(newkey);
13562 if (res < 0)
13563 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013564 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013565 /* just keep integer keys */
13566 if (PyDict_SetItem(new, key, value) < 0)
13567 goto err;
13568 } else {
13569 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13570 "be strings or integers");
13571 goto err;
13572 }
13573 }
13574 }
13575 return new;
13576 err:
13577 Py_DECREF(new);
13578 return NULL;
13579}
13580
INADA Naoki3ae20562017-01-16 20:41:20 +090013581/*[clinic input]
13582str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013583
INADA Naoki3ae20562017-01-16 20:41:20 +090013584 table: object
13585 Translation table, which must be a mapping of Unicode ordinals to
13586 Unicode ordinals, strings, or None.
13587 /
13588
13589Replace each character in the string using the given translation table.
13590
13591The table must implement lookup/indexing via __getitem__, for instance a
13592dictionary or list. If this operation raises LookupError, the character is
13593left untouched. Characters mapped to None are deleted.
13594[clinic start generated code]*/
13595
13596static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013597unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013598/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013599{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013600 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013601}
13602
INADA Naoki3ae20562017-01-16 20:41:20 +090013603/*[clinic input]
13604str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013605
INADA Naoki3ae20562017-01-16 20:41:20 +090013606Return a copy of the string converted to uppercase.
13607[clinic start generated code]*/
13608
13609static PyObject *
13610unicode_upper_impl(PyObject *self)
13611/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013612{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013613 if (PyUnicode_READY(self) == -1)
13614 return NULL;
13615 if (PyUnicode_IS_ASCII(self))
13616 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013617 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013618}
13619
INADA Naoki3ae20562017-01-16 20:41:20 +090013620/*[clinic input]
13621str.zfill as unicode_zfill
13622
13623 width: Py_ssize_t
13624 /
13625
13626Pad a numeric string with zeros on the left, to fill a field of the given width.
13627
13628The string is never truncated.
13629[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013630
13631static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013632unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013633/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013634{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013635 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013636 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013637 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013638 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013639 Py_UCS4 chr;
13640
Benjamin Petersonbac79492012-01-14 13:34:47 -050013641 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013643
Victor Stinnerc4b49542011-12-11 22:44:26 +010013644 if (PyUnicode_GET_LENGTH(self) >= width)
13645 return unicode_result_unchanged(self);
13646
13647 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013648
13649 u = pad(self, fill, 0, '0');
13650
Walter Dörwald068325e2002-04-15 13:36:47 +000013651 if (u == NULL)
13652 return NULL;
13653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013654 kind = PyUnicode_KIND(u);
13655 data = PyUnicode_DATA(u);
13656 chr = PyUnicode_READ(kind, data, fill);
13657
13658 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013659 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013660 PyUnicode_WRITE(kind, data, 0, chr);
13661 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013662 }
13663
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013664 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013665 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013666}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013667
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013676PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013677 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013678\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013679Return True if S starts with the specified prefix, False otherwise.\n\
13680With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013681With optional end, stop comparing S at that position.\n\
13682prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013683
13684static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013685unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013686 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013687{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013688 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013689 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013690 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013691 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013692 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013693
Jesus Ceaac451502011-04-20 17:09:23 +020013694 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013695 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013696 if (PyTuple_Check(subobj)) {
13697 Py_ssize_t i;
13698 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013699 substring = PyTuple_GET_ITEM(subobj, i);
13700 if (!PyUnicode_Check(substring)) {
13701 PyErr_Format(PyExc_TypeError,
13702 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013703 "not %.100s",
13704 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013705 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013706 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013707 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013708 if (result == -1)
13709 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013710 if (result) {
13711 Py_RETURN_TRUE;
13712 }
13713 }
13714 /* nothing matched */
13715 Py_RETURN_FALSE;
13716 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013717 if (!PyUnicode_Check(subobj)) {
13718 PyErr_Format(PyExc_TypeError,
13719 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013720 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013721 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013722 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013723 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013724 if (result == -1)
13725 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013726 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013727}
13728
13729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013730PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013731 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013732\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013733Return True if S ends with the specified suffix, False otherwise.\n\
13734With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013735With optional end, stop comparing S at that position.\n\
13736suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013737
13738static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013739unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013741{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013742 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013743 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013744 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013745 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013746 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013747
Jesus Ceaac451502011-04-20 17:09:23 +020013748 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013749 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013750 if (PyTuple_Check(subobj)) {
13751 Py_ssize_t i;
13752 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013753 substring = PyTuple_GET_ITEM(subobj, i);
13754 if (!PyUnicode_Check(substring)) {
13755 PyErr_Format(PyExc_TypeError,
13756 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013757 "not %.100s",
13758 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013759 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013760 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013761 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013762 if (result == -1)
13763 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013764 if (result) {
13765 Py_RETURN_TRUE;
13766 }
13767 }
13768 Py_RETURN_FALSE;
13769 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013770 if (!PyUnicode_Check(subobj)) {
13771 PyErr_Format(PyExc_TypeError,
13772 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013773 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013774 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013775 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013776 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013777 if (result == -1)
13778 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013779 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013780}
13781
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013782static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013783_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013784{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013785 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13786 writer->data = PyUnicode_DATA(writer->buffer);
13787
13788 if (!writer->readonly) {
13789 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013790 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013791 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013792 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013793 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13794 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13795 writer->kind = PyUnicode_WCHAR_KIND;
13796 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13797
Victor Stinner8f674cc2013-04-17 23:02:17 +020013798 /* Copy-on-write mode: set buffer size to 0 so
13799 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13800 * next write. */
13801 writer->size = 0;
13802 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013803}
13804
Victor Stinnerd3f08822012-05-29 12:57:52 +020013805void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013806_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013807{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013808 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013809
13810 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013811 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013812
13813 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13814 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13815 writer->kind = PyUnicode_WCHAR_KIND;
13816 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013817}
13818
Inada Naoki770847a2019-06-24 12:30:24 +090013819// Initialize _PyUnicodeWriter with initial buffer
13820static inline void
13821_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13822{
13823 memset(writer, 0, sizeof(*writer));
13824 writer->buffer = buffer;
13825 _PyUnicodeWriter_Update(writer);
13826 writer->min_length = writer->size;
13827}
13828
Victor Stinnerd3f08822012-05-29 12:57:52 +020013829int
13830_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13831 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013832{
13833 Py_ssize_t newlen;
13834 PyObject *newbuffer;
13835
Victor Stinner2740e462016-09-06 16:58:36 -070013836 assert(maxchar <= MAX_UNICODE);
13837
Victor Stinnerca9381e2015-09-22 00:58:32 +020013838 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013839 assert((maxchar > writer->maxchar && length >= 0)
13840 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841
Victor Stinner202fdca2012-05-07 12:47:02 +020013842 if (length > PY_SSIZE_T_MAX - writer->pos) {
13843 PyErr_NoMemory();
13844 return -1;
13845 }
13846 newlen = writer->pos + length;
13847
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013848 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013849
Victor Stinnerd3f08822012-05-29 12:57:52 +020013850 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013851 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013852 if (writer->overallocate
13853 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13854 /* overallocate to limit the number of realloc() */
13855 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013856 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013857 if (newlen < writer->min_length)
13858 newlen = writer->min_length;
13859
Victor Stinnerd3f08822012-05-29 12:57:52 +020013860 writer->buffer = PyUnicode_New(newlen, maxchar);
13861 if (writer->buffer == NULL)
13862 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013863 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013864 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013865 if (writer->overallocate
13866 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13867 /* overallocate to limit the number of realloc() */
13868 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013869 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013870 if (newlen < writer->min_length)
13871 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013872
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013873 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013874 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013875 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013876 newbuffer = PyUnicode_New(newlen, maxchar);
13877 if (newbuffer == NULL)
13878 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013879 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13880 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013881 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013882 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013883 }
13884 else {
13885 newbuffer = resize_compact(writer->buffer, newlen);
13886 if (newbuffer == NULL)
13887 return -1;
13888 }
13889 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013890 }
13891 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013892 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013893 newbuffer = PyUnicode_New(writer->size, maxchar);
13894 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013895 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013896 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13897 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013898 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013899 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013900 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013901 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013902
13903#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013904}
13905
Victor Stinnerca9381e2015-09-22 00:58:32 +020013906int
13907_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13908 enum PyUnicode_Kind kind)
13909{
13910 Py_UCS4 maxchar;
13911
13912 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13913 assert(writer->kind < kind);
13914
13915 switch (kind)
13916 {
13917 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13918 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13919 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13920 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013921 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013922 }
13923
13924 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13925}
13926
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013927static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013928_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013929{
Victor Stinner2740e462016-09-06 16:58:36 -070013930 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013931 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13932 return -1;
13933 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13934 writer->pos++;
13935 return 0;
13936}
13937
13938int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013939_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13940{
13941 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13942}
13943
13944int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013945_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13946{
13947 Py_UCS4 maxchar;
13948 Py_ssize_t len;
13949
13950 if (PyUnicode_READY(str) == -1)
13951 return -1;
13952 len = PyUnicode_GET_LENGTH(str);
13953 if (len == 0)
13954 return 0;
13955 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13956 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013957 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013958 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013959 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013960 Py_INCREF(str);
13961 writer->buffer = str;
13962 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013963 writer->pos += len;
13964 return 0;
13965 }
13966 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13967 return -1;
13968 }
13969 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13970 str, 0, len);
13971 writer->pos += len;
13972 return 0;
13973}
13974
Victor Stinnere215d962012-10-06 23:03:36 +020013975int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013976_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13977 Py_ssize_t start, Py_ssize_t end)
13978{
13979 Py_UCS4 maxchar;
13980 Py_ssize_t len;
13981
13982 if (PyUnicode_READY(str) == -1)
13983 return -1;
13984
13985 assert(0 <= start);
13986 assert(end <= PyUnicode_GET_LENGTH(str));
13987 assert(start <= end);
13988
13989 if (end == 0)
13990 return 0;
13991
13992 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13993 return _PyUnicodeWriter_WriteStr(writer, str);
13994
13995 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13996 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13997 else
13998 maxchar = writer->maxchar;
13999 len = end - start;
14000
14001 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14002 return -1;
14003
14004 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14005 str, start, len);
14006 writer->pos += len;
14007 return 0;
14008}
14009
14010int
Victor Stinner4a587072013-11-19 12:54:53 +010014011_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14012 const char *ascii, Py_ssize_t len)
14013{
14014 if (len == -1)
14015 len = strlen(ascii);
14016
Andy Lestere6be9b52020-02-11 20:28:35 -060014017 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014018
14019 if (writer->buffer == NULL && !writer->overallocate) {
14020 PyObject *str;
14021
14022 str = _PyUnicode_FromASCII(ascii, len);
14023 if (str == NULL)
14024 return -1;
14025
14026 writer->readonly = 1;
14027 writer->buffer = str;
14028 _PyUnicodeWriter_Update(writer);
14029 writer->pos += len;
14030 return 0;
14031 }
14032
14033 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14034 return -1;
14035
14036 switch (writer->kind)
14037 {
14038 case PyUnicode_1BYTE_KIND:
14039 {
14040 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14041 Py_UCS1 *data = writer->data;
14042
Christian Heimesf051e432016-09-13 20:22:02 +020014043 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014044 break;
14045 }
14046 case PyUnicode_2BYTE_KIND:
14047 {
14048 _PyUnicode_CONVERT_BYTES(
14049 Py_UCS1, Py_UCS2,
14050 ascii, ascii + len,
14051 (Py_UCS2 *)writer->data + writer->pos);
14052 break;
14053 }
14054 case PyUnicode_4BYTE_KIND:
14055 {
14056 _PyUnicode_CONVERT_BYTES(
14057 Py_UCS1, Py_UCS4,
14058 ascii, ascii + len,
14059 (Py_UCS4 *)writer->data + writer->pos);
14060 break;
14061 }
14062 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014063 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014064 }
14065
14066 writer->pos += len;
14067 return 0;
14068}
14069
14070int
14071_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14072 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014073{
14074 Py_UCS4 maxchar;
14075
Andy Lestere6be9b52020-02-11 20:28:35 -060014076 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014077 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14078 return -1;
14079 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14080 writer->pos += len;
14081 return 0;
14082}
14083
Victor Stinnerd3f08822012-05-29 12:57:52 +020014084PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014085_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014086{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014087 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014088
Victor Stinnerd3f08822012-05-29 12:57:52 +020014089 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014090 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014091 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014092 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014093
14094 str = writer->buffer;
14095 writer->buffer = NULL;
14096
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014097 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014098 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14099 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014100 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014101
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014102 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14103 PyObject *str2;
14104 str2 = resize_compact(str, writer->pos);
14105 if (str2 == NULL) {
14106 Py_DECREF(str);
14107 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014108 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014109 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014110 }
14111
Victor Stinner15a0bd32013-07-08 22:29:55 +020014112 assert(_PyUnicode_CheckConsistency(str, 1));
14113 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014114}
14115
Victor Stinnerd3f08822012-05-29 12:57:52 +020014116void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014117_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014118{
14119 Py_CLEAR(writer->buffer);
14120}
14121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014122#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014123
14124PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014125 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014126\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014127Return a formatted version of S, using substitutions from args and kwargs.\n\
14128The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014129
Eric Smith27bbca62010-11-04 17:06:58 +000014130PyDoc_STRVAR(format_map__doc__,
14131 "S.format_map(mapping) -> str\n\
14132\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014133Return a formatted version of S, using substitutions from mapping.\n\
14134The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014135
INADA Naoki3ae20562017-01-16 20:41:20 +090014136/*[clinic input]
14137str.__format__ as unicode___format__
14138
14139 format_spec: unicode
14140 /
14141
14142Return a formatted version of the string as described by format_spec.
14143[clinic start generated code]*/
14144
Eric Smith4a7d76d2008-05-30 18:10:19 +000014145static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014146unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014147/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014148{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014149 _PyUnicodeWriter writer;
14150 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014151
Victor Stinnerd3f08822012-05-29 12:57:52 +020014152 if (PyUnicode_READY(self) == -1)
14153 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014154 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014155 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14156 self, format_spec, 0,
14157 PyUnicode_GET_LENGTH(format_spec));
14158 if (ret == -1) {
14159 _PyUnicodeWriter_Dealloc(&writer);
14160 return NULL;
14161 }
14162 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014163}
14164
INADA Naoki3ae20562017-01-16 20:41:20 +090014165/*[clinic input]
14166str.__sizeof__ as unicode_sizeof
14167
14168Return the size of the string in memory, in bytes.
14169[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014170
14171static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014172unicode_sizeof_impl(PyObject *self)
14173/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014174{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014175 Py_ssize_t size;
14176
14177 /* If it's a compact object, account for base structure +
14178 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014179 if (PyUnicode_IS_COMPACT_ASCII(self))
14180 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14181 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014182 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014183 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014184 else {
14185 /* If it is a two-block object, account for base object, and
14186 for character block if present. */
14187 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014188 if (_PyUnicode_DATA_ANY(self))
14189 size += (PyUnicode_GET_LENGTH(self) + 1) *
14190 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014191 }
14192 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014193 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014194 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14195 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14196 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14197 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014198
14199 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014200}
14201
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014202static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014203unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014204{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014205 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014206 if (!copy)
14207 return NULL;
14208 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014209}
14210
/* Method table for the str type.

   Entries spelled as UNICODE_*_METHODDEF are macros defined outside this
   table (presumably Argument Clinic generated -- confirm against the
   clinic header); each is expected to expand to a complete entry including
   its trailing comma.  The remaining methods are written out by hand as
   {name, C function, calling-convention flags, docstring}. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() takes keywords, so its function has a three-argument
       signature; the intermediate cast through void(*)(void) keeps the
       conversion to PyCFunction explicit. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    /* Sentinel: marks the end of the table. */
    {NULL, NULL}
};
14269
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014270static PyObject *
14271unicode_mod(PyObject *v, PyObject *w)
14272{
Brian Curtindfc80e32011-08-10 20:28:54 -050014273 if (!PyUnicode_Check(v))
14274 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014275 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014276}
14277
14278static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014279 0, /*nb_add*/
14280 0, /*nb_subtract*/
14281 0, /*nb_multiply*/
14282 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014283};
14284
Guido van Rossumd57fd912000-03-10 22:53:23 +000014285static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 (lenfunc) unicode_length, /* sq_length */
14287 PyUnicode_Concat, /* sq_concat */
14288 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14289 (ssizeargfunc) unicode_getitem, /* sq_item */
14290 0, /* sq_slice */
14291 0, /* sq_ass_item */
14292 0, /* sq_ass_slice */
14293 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014294};
14295
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014296static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014297unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014299 if (PyUnicode_READY(self) == -1)
14300 return NULL;
14301
Victor Stinnera15e2602020-04-08 02:01:56 +020014302 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014303 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014304 if (i == -1 && PyErr_Occurred())
14305 return NULL;
14306 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014307 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014308 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014309 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014310 Py_ssize_t start, stop, step, slicelength, i;
14311 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014312 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014313 const void *src_data;
14314 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014315 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014316 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014317
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014318 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014319 return NULL;
14320 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014321 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14322 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014323
14324 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014325 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014326 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014327 slicelength == PyUnicode_GET_LENGTH(self)) {
14328 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014329 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014330 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014331 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014332 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014333 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014334 src_kind = PyUnicode_KIND(self);
14335 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014336 if (!PyUnicode_IS_ASCII(self)) {
14337 kind_limit = kind_maxchar_limit(src_kind);
14338 max_char = 0;
14339 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14340 ch = PyUnicode_READ(src_kind, src_data, cur);
14341 if (ch > max_char) {
14342 max_char = ch;
14343 if (max_char >= kind_limit)
14344 break;
14345 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014346 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014347 }
Victor Stinner55c99112011-10-13 01:17:06 +020014348 else
14349 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014350 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014351 if (result == NULL)
14352 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014353 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014354 dest_data = PyUnicode_DATA(result);
14355
14356 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014357 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14358 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014359 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014360 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014361 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014362 } else {
14363 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14364 return NULL;
14365 }
14366}
14367
14368static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014369 (lenfunc)unicode_length, /* mp_length */
14370 (binaryfunc)unicode_subscript, /* mp_subscript */
14371 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014372};
14373
Guido van Rossumd57fd912000-03-10 22:53:23 +000014374
Guido van Rossumd57fd912000-03-10 22:53:23 +000014375/* Helpers for PyUnicode_Format() */
14376
Victor Stinnera47082312012-10-04 02:19:54 +020014377struct unicode_formatter_t {
14378 PyObject *args;
14379 int args_owned;
14380 Py_ssize_t arglen, argidx;
14381 PyObject *dict;
14382
14383 enum PyUnicode_Kind fmtkind;
14384 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014385 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014386 PyObject *fmtstr;
14387
14388 _PyUnicodeWriter writer;
14389};
14390
14391struct unicode_format_arg_t {
14392 Py_UCS4 ch;
14393 int flags;
14394 Py_ssize_t width;
14395 int prec;
14396 int sign;
14397};
14398
Guido van Rossumd57fd912000-03-10 22:53:23 +000014399static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014400unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014401{
Victor Stinnera47082312012-10-04 02:19:54 +020014402 Py_ssize_t argidx = ctx->argidx;
14403
14404 if (argidx < ctx->arglen) {
14405 ctx->argidx++;
14406 if (ctx->arglen < 0)
14407 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014408 else
Victor Stinnera47082312012-10-04 02:19:54 +020014409 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014410 }
14411 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014412 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014413 return NULL;
14414}
14415
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014416/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014417
Victor Stinnera47082312012-10-04 02:19:54 +020014418/* Format a float into the writer if the writer is not NULL, or into *p_output
14419 otherwise.
14420
14421 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014422static int
Victor Stinnera47082312012-10-04 02:19:54 +020014423formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14424 PyObject **p_output,
14425 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014426{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014427 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014428 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014429 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014430 int prec;
14431 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014432
Guido van Rossumd57fd912000-03-10 22:53:23 +000014433 x = PyFloat_AsDouble(v);
14434 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014435 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014436
Victor Stinnera47082312012-10-04 02:19:54 +020014437 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014438 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014439 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014440
Victor Stinnera47082312012-10-04 02:19:54 +020014441 if (arg->flags & F_ALT)
14442 dtoa_flags = Py_DTSF_ALT;
14443 else
14444 dtoa_flags = 0;
14445 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014446 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014447 return -1;
14448 len = strlen(p);
14449 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014450 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014451 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014452 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014453 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014454 }
14455 else
14456 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014457 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014458 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014459}
14460
Victor Stinnerd0880d52012-04-27 23:40:13 +020014461/* formatlong() emulates the format codes d, u, o, x and X, and
14462 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14463 * Python's regular ints.
14464 * Return value: a new PyUnicodeObject*, or NULL if error.
14465 * The output string is of the form
14466 * "-"? ("0x" | "0X")? digit+
14467 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14468 * set in flags. The case of hex digits will be correct,
14469 * There will be at least prec digits, zero-filled on the left if
14470 * necessary to get that many.
14471 * val object to be converted
14472 * flags bitmask of format flags; only F_ALT is looked at
14473 * prec minimum number of digits; 0-fill on left if needed
14474 * type a character in [duoxX]; u acts the same as d
14475 *
14476 * CAUTION: o, x and X conversions on regular ints can never
14477 * produce a '-' sign, but can for Python's unbounded ints.
14478 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014479PyObject *
14480_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014481{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014482 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014483 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014484 Py_ssize_t i;
14485 int sign; /* 1 if '-', else 0 */
14486 int len; /* number of characters */
14487 Py_ssize_t llen;
14488 int numdigits; /* len == numnondigits + numdigits */
14489 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014490
Victor Stinnerd0880d52012-04-27 23:40:13 +020014491 /* Avoid exceeding SSIZE_T_MAX */
14492 if (prec > INT_MAX-3) {
14493 PyErr_SetString(PyExc_OverflowError,
14494 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014495 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014496 }
14497
14498 assert(PyLong_Check(val));
14499
14500 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014501 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014502 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014503 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014504 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014505 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014506 /* int and int subclasses should print numerically when a numeric */
14507 /* format code is used (see issue18780) */
14508 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014509 break;
14510 case 'o':
14511 numnondigits = 2;
14512 result = PyNumber_ToBase(val, 8);
14513 break;
14514 case 'x':
14515 case 'X':
14516 numnondigits = 2;
14517 result = PyNumber_ToBase(val, 16);
14518 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014519 }
14520 if (!result)
14521 return NULL;
14522
14523 assert(unicode_modifiable(result));
14524 assert(PyUnicode_IS_READY(result));
14525 assert(PyUnicode_IS_ASCII(result));
14526
14527 /* To modify the string in-place, there can only be one reference. */
14528 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014529 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014530 PyErr_BadInternalCall();
14531 return NULL;
14532 }
14533 buf = PyUnicode_DATA(result);
14534 llen = PyUnicode_GET_LENGTH(result);
14535 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014536 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014537 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014538 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014539 return NULL;
14540 }
14541 len = (int)llen;
14542 sign = buf[0] == '-';
14543 numnondigits += sign;
14544 numdigits = len - numnondigits;
14545 assert(numdigits > 0);
14546
14547 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014548 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014549 (type == 'o' || type == 'x' || type == 'X'))) {
14550 assert(buf[sign] == '0');
14551 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14552 buf[sign+1] == 'o');
14553 numnondigits -= 2;
14554 buf += 2;
14555 len -= 2;
14556 if (sign)
14557 buf[0] = '-';
14558 assert(len == numnondigits + numdigits);
14559 assert(numdigits > 0);
14560 }
14561
14562 /* Fill with leading zeroes to meet minimum width. */
14563 if (prec > numdigits) {
14564 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14565 numnondigits + prec);
14566 char *b1;
14567 if (!r1) {
14568 Py_DECREF(result);
14569 return NULL;
14570 }
14571 b1 = PyBytes_AS_STRING(r1);
14572 for (i = 0; i < numnondigits; ++i)
14573 *b1++ = *buf++;
14574 for (i = 0; i < prec - numdigits; i++)
14575 *b1++ = '0';
14576 for (i = 0; i < numdigits; i++)
14577 *b1++ = *buf++;
14578 *b1 = '\0';
14579 Py_DECREF(result);
14580 result = r1;
14581 buf = PyBytes_AS_STRING(result);
14582 len = numnondigits + prec;
14583 }
14584
14585 /* Fix up case for hex conversions. */
14586 if (type == 'X') {
14587 /* Need to convert all lower case letters to upper case.
14588 and need to convert 0x to 0X (and -0x to -0X). */
14589 for (i = 0; i < len; i++)
14590 if (buf[i] >= 'a' && buf[i] <= 'x')
14591 buf[i] -= 'a'-'A';
14592 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014593 if (!PyUnicode_Check(result)
14594 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014595 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014596 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014597 Py_DECREF(result);
14598 result = unicode;
14599 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014600 else if (len != PyUnicode_GET_LENGTH(result)) {
14601 if (PyUnicode_Resize(&result, len) < 0)
14602 Py_CLEAR(result);
14603 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014604 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014605}
14606
Ethan Furmandf3ed242014-01-05 06:50:30 -080014607/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014608 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014609 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014610 * -1 and raise an exception on error */
14611static int
Victor Stinnera47082312012-10-04 02:19:54 +020014612mainformatlong(PyObject *v,
14613 struct unicode_format_arg_t *arg,
14614 PyObject **p_output,
14615 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014616{
14617 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014618 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014619
14620 if (!PyNumber_Check(v))
14621 goto wrongtype;
14622
Ethan Furman9ab74802014-03-21 06:38:46 -070014623 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014624 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014625 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014626 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014627 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014628 if (PyErr_ExceptionMatches(PyExc_TypeError))
14629 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014630 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014631 }
14632 }
14633 else {
14634 iobj = PyNumber_Long(v);
14635 if (iobj == NULL ) {
14636 if (PyErr_ExceptionMatches(PyExc_TypeError))
14637 goto wrongtype;
14638 return -1;
14639 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014640 }
14641 assert(PyLong_Check(iobj));
14642 }
14643 else {
14644 iobj = v;
14645 Py_INCREF(iobj);
14646 }
14647
14648 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014649 && arg->width == -1 && arg->prec == -1
14650 && !(arg->flags & (F_SIGN | F_BLANK))
14651 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014652 {
14653 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014654 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014655 int base;
14656
Victor Stinnera47082312012-10-04 02:19:54 +020014657 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014658 {
14659 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014660 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014661 case 'd':
14662 case 'i':
14663 case 'u':
14664 base = 10;
14665 break;
14666 case 'o':
14667 base = 8;
14668 break;
14669 case 'x':
14670 case 'X':
14671 base = 16;
14672 break;
14673 }
14674
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014675 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14676 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014677 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014678 }
14679 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014680 return 1;
14681 }
14682
Ethan Furmanb95b5612015-01-23 20:05:18 -080014683 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014684 Py_DECREF(iobj);
14685 if (res == NULL)
14686 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014687 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014688 return 0;
14689
14690wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014691 switch(type)
14692 {
14693 case 'o':
14694 case 'x':
14695 case 'X':
14696 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014697 "%%%c format: an integer is required, "
14698 "not %.200s",
14699 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014700 break;
14701 default:
14702 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014703 "%%%c format: a number is required, "
14704 "not %.200s",
14705 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014706 break;
14707 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014708 return -1;
14709}
14710
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014711static Py_UCS4
14712formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014713{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014714 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014715 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014716 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014717 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014718 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014719 goto onError;
14720 }
14721 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014722 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014723 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014724 /* make sure number is a type of integer */
14725 if (!PyLong_Check(v)) {
14726 iobj = PyNumber_Index(v);
14727 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014728 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014729 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014730 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014731 Py_DECREF(iobj);
14732 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014733 else {
14734 x = PyLong_AsLong(v);
14735 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014736 if (x == -1 && PyErr_Occurred())
14737 goto onError;
14738
Victor Stinner8faf8212011-12-08 22:14:11 +010014739 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014740 PyErr_SetString(PyExc_OverflowError,
14741 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014742 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014743 }
14744
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014745 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014746 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014747
Benjamin Peterson29060642009-01-31 22:14:21 +000014748 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014749 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014750 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014751 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014752}
14753
Victor Stinnera47082312012-10-04 02:19:54 +020014754/* Parse options of an argument: flags, width, precision.
14755 Handle also "%(name)" syntax.
14756
14757 Return 0 if the argument has been formatted into arg->str.
14758 Return 1 if the argument has been written into ctx->writer,
14759 Raise an exception and return -1 on error. */
14760static int
14761unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14762 struct unicode_format_arg_t *arg)
14763{
14764#define FORMAT_READ(ctx) \
14765 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14766
14767 PyObject *v;
14768
Victor Stinnera47082312012-10-04 02:19:54 +020014769 if (arg->ch == '(') {
14770 /* Get argument value from a dictionary. Example: "%(name)s". */
14771 Py_ssize_t keystart;
14772 Py_ssize_t keylen;
14773 PyObject *key;
14774 int pcount = 1;
14775
14776 if (ctx->dict == NULL) {
14777 PyErr_SetString(PyExc_TypeError,
14778 "format requires a mapping");
14779 return -1;
14780 }
14781 ++ctx->fmtpos;
14782 --ctx->fmtcnt;
14783 keystart = ctx->fmtpos;
14784 /* Skip over balanced parentheses */
14785 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14786 arg->ch = FORMAT_READ(ctx);
14787 if (arg->ch == ')')
14788 --pcount;
14789 else if (arg->ch == '(')
14790 ++pcount;
14791 ctx->fmtpos++;
14792 }
14793 keylen = ctx->fmtpos - keystart - 1;
14794 if (ctx->fmtcnt < 0 || pcount > 0) {
14795 PyErr_SetString(PyExc_ValueError,
14796 "incomplete format key");
14797 return -1;
14798 }
14799 key = PyUnicode_Substring(ctx->fmtstr,
14800 keystart, keystart + keylen);
14801 if (key == NULL)
14802 return -1;
14803 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014804 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014805 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014806 }
14807 ctx->args = PyObject_GetItem(ctx->dict, key);
14808 Py_DECREF(key);
14809 if (ctx->args == NULL)
14810 return -1;
14811 ctx->args_owned = 1;
14812 ctx->arglen = -1;
14813 ctx->argidx = -2;
14814 }
14815
14816 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014817 while (--ctx->fmtcnt >= 0) {
14818 arg->ch = FORMAT_READ(ctx);
14819 ctx->fmtpos++;
14820 switch (arg->ch) {
14821 case '-': arg->flags |= F_LJUST; continue;
14822 case '+': arg->flags |= F_SIGN; continue;
14823 case ' ': arg->flags |= F_BLANK; continue;
14824 case '#': arg->flags |= F_ALT; continue;
14825 case '0': arg->flags |= F_ZERO; continue;
14826 }
14827 break;
14828 }
14829
14830 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014831 if (arg->ch == '*') {
14832 v = unicode_format_getnextarg(ctx);
14833 if (v == NULL)
14834 return -1;
14835 if (!PyLong_Check(v)) {
14836 PyErr_SetString(PyExc_TypeError,
14837 "* wants int");
14838 return -1;
14839 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014840 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014841 if (arg->width == -1 && PyErr_Occurred())
14842 return -1;
14843 if (arg->width < 0) {
14844 arg->flags |= F_LJUST;
14845 arg->width = -arg->width;
14846 }
14847 if (--ctx->fmtcnt >= 0) {
14848 arg->ch = FORMAT_READ(ctx);
14849 ctx->fmtpos++;
14850 }
14851 }
14852 else if (arg->ch >= '0' && arg->ch <= '9') {
14853 arg->width = arg->ch - '0';
14854 while (--ctx->fmtcnt >= 0) {
14855 arg->ch = FORMAT_READ(ctx);
14856 ctx->fmtpos++;
14857 if (arg->ch < '0' || arg->ch > '9')
14858 break;
14859 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14860 mixing signed and unsigned comparison. Since arg->ch is between
14861 '0' and '9', casting to int is safe. */
14862 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14863 PyErr_SetString(PyExc_ValueError,
14864 "width too big");
14865 return -1;
14866 }
14867 arg->width = arg->width*10 + (arg->ch - '0');
14868 }
14869 }
14870
14871 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014872 if (arg->ch == '.') {
14873 arg->prec = 0;
14874 if (--ctx->fmtcnt >= 0) {
14875 arg->ch = FORMAT_READ(ctx);
14876 ctx->fmtpos++;
14877 }
14878 if (arg->ch == '*') {
14879 v = unicode_format_getnextarg(ctx);
14880 if (v == NULL)
14881 return -1;
14882 if (!PyLong_Check(v)) {
14883 PyErr_SetString(PyExc_TypeError,
14884 "* wants int");
14885 return -1;
14886 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014887 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014888 if (arg->prec == -1 && PyErr_Occurred())
14889 return -1;
14890 if (arg->prec < 0)
14891 arg->prec = 0;
14892 if (--ctx->fmtcnt >= 0) {
14893 arg->ch = FORMAT_READ(ctx);
14894 ctx->fmtpos++;
14895 }
14896 }
14897 else if (arg->ch >= '0' && arg->ch <= '9') {
14898 arg->prec = arg->ch - '0';
14899 while (--ctx->fmtcnt >= 0) {
14900 arg->ch = FORMAT_READ(ctx);
14901 ctx->fmtpos++;
14902 if (arg->ch < '0' || arg->ch > '9')
14903 break;
14904 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14905 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014906 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014907 return -1;
14908 }
14909 arg->prec = arg->prec*10 + (arg->ch - '0');
14910 }
14911 }
14912 }
14913
14914 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14915 if (ctx->fmtcnt >= 0) {
14916 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14917 if (--ctx->fmtcnt >= 0) {
14918 arg->ch = FORMAT_READ(ctx);
14919 ctx->fmtpos++;
14920 }
14921 }
14922 }
14923 if (ctx->fmtcnt < 0) {
14924 PyErr_SetString(PyExc_ValueError,
14925 "incomplete format");
14926 return -1;
14927 }
14928 return 0;
14929
14930#undef FORMAT_READ
14931}
14932
14933/* Format one argument. Supported conversion specifiers:
14934
14935 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014936 - "i", "d", "u": int or float
14937 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014938 - "e", "E", "f", "F", "g", "G": float
14939 - "c": int or str (1 character)
14940
Victor Stinner8dbd4212012-12-04 09:30:24 +010014941 When possible, the output is written directly into the Unicode writer
14942 (ctx->writer). A string is created when padding is required.
14943
Victor Stinnera47082312012-10-04 02:19:54 +020014944 Return 0 if the argument has been formatted into *p_str,
14945 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014946 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014947static int
14948unicode_format_arg_format(struct unicode_formatter_t *ctx,
14949 struct unicode_format_arg_t *arg,
14950 PyObject **p_str)
14951{
14952 PyObject *v;
14953 _PyUnicodeWriter *writer = &ctx->writer;
14954
14955 if (ctx->fmtcnt == 0)
14956 ctx->writer.overallocate = 0;
14957
Victor Stinnera47082312012-10-04 02:19:54 +020014958 v = unicode_format_getnextarg(ctx);
14959 if (v == NULL)
14960 return -1;
14961
Victor Stinnera47082312012-10-04 02:19:54 +020014962
14963 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014964 case 's':
14965 case 'r':
14966 case 'a':
14967 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14968 /* Fast path */
14969 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14970 return -1;
14971 return 1;
14972 }
14973
14974 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14975 *p_str = v;
14976 Py_INCREF(*p_str);
14977 }
14978 else {
14979 if (arg->ch == 's')
14980 *p_str = PyObject_Str(v);
14981 else if (arg->ch == 'r')
14982 *p_str = PyObject_Repr(v);
14983 else
14984 *p_str = PyObject_ASCII(v);
14985 }
14986 break;
14987
14988 case 'i':
14989 case 'd':
14990 case 'u':
14991 case 'o':
14992 case 'x':
14993 case 'X':
14994 {
14995 int ret = mainformatlong(v, arg, p_str, writer);
14996 if (ret != 0)
14997 return ret;
14998 arg->sign = 1;
14999 break;
15000 }
15001
15002 case 'e':
15003 case 'E':
15004 case 'f':
15005 case 'F':
15006 case 'g':
15007 case 'G':
15008 if (arg->width == -1 && arg->prec == -1
15009 && !(arg->flags & (F_SIGN | F_BLANK)))
15010 {
15011 /* Fast path */
15012 if (formatfloat(v, arg, NULL, writer) == -1)
15013 return -1;
15014 return 1;
15015 }
15016
15017 arg->sign = 1;
15018 if (formatfloat(v, arg, p_str, NULL) == -1)
15019 return -1;
15020 break;
15021
15022 case 'c':
15023 {
15024 Py_UCS4 ch = formatchar(v);
15025 if (ch == (Py_UCS4) -1)
15026 return -1;
15027 if (arg->width == -1 && arg->prec == -1) {
15028 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015029 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015030 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015031 return 1;
15032 }
15033 *p_str = PyUnicode_FromOrdinal(ch);
15034 break;
15035 }
15036
15037 default:
15038 PyErr_Format(PyExc_ValueError,
15039 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015040 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015041 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15042 (int)arg->ch,
15043 ctx->fmtpos - 1);
15044 return -1;
15045 }
15046 if (*p_str == NULL)
15047 return -1;
15048 assert (PyUnicode_Check(*p_str));
15049 return 0;
15050}
15051
15052static int
15053unicode_format_arg_output(struct unicode_formatter_t *ctx,
15054 struct unicode_format_arg_t *arg,
15055 PyObject *str)
15056{
15057 Py_ssize_t len;
15058 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015059 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015060 Py_ssize_t pindex;
15061 Py_UCS4 signchar;
15062 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015063 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015064 Py_ssize_t sublen;
15065 _PyUnicodeWriter *writer = &ctx->writer;
15066 Py_UCS4 fill;
15067
15068 fill = ' ';
15069 if (arg->sign && arg->flags & F_ZERO)
15070 fill = '0';
15071
15072 if (PyUnicode_READY(str) == -1)
15073 return -1;
15074
15075 len = PyUnicode_GET_LENGTH(str);
15076 if ((arg->width == -1 || arg->width <= len)
15077 && (arg->prec == -1 || arg->prec >= len)
15078 && !(arg->flags & (F_SIGN | F_BLANK)))
15079 {
15080 /* Fast path */
15081 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15082 return -1;
15083 return 0;
15084 }
15085
15086 /* Truncate the string for "s", "r" and "a" formats
15087 if the precision is set */
15088 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15089 if (arg->prec >= 0 && len > arg->prec)
15090 len = arg->prec;
15091 }
15092
15093 /* Adjust sign and width */
15094 kind = PyUnicode_KIND(str);
15095 pbuf = PyUnicode_DATA(str);
15096 pindex = 0;
15097 signchar = '\0';
15098 if (arg->sign) {
15099 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15100 if (ch == '-' || ch == '+') {
15101 signchar = ch;
15102 len--;
15103 pindex++;
15104 }
15105 else if (arg->flags & F_SIGN)
15106 signchar = '+';
15107 else if (arg->flags & F_BLANK)
15108 signchar = ' ';
15109 else
15110 arg->sign = 0;
15111 }
15112 if (arg->width < len)
15113 arg->width = len;
15114
15115 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015116 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015117 if (!(arg->flags & F_LJUST)) {
15118 if (arg->sign) {
15119 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015120 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015121 }
15122 else {
15123 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015124 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015125 }
15126 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015127 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15128 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015129 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015130 }
15131
Victor Stinnera47082312012-10-04 02:19:54 +020015132 buflen = arg->width;
15133 if (arg->sign && len == arg->width)
15134 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015135 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015136 return -1;
15137
15138 /* Write the sign if needed */
15139 if (arg->sign) {
15140 if (fill != ' ') {
15141 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15142 writer->pos += 1;
15143 }
15144 if (arg->width > len)
15145 arg->width--;
15146 }
15147
15148 /* Write the numeric prefix for "x", "X" and "o" formats
15149 if the alternate form is used.
15150 For example, write "0x" for the "%#x" format. */
15151 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15152 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15153 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15154 if (fill != ' ') {
15155 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15156 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15157 writer->pos += 2;
15158 pindex += 2;
15159 }
15160 arg->width -= 2;
15161 if (arg->width < 0)
15162 arg->width = 0;
15163 len -= 2;
15164 }
15165
15166 /* Pad left with the fill character if needed */
15167 if (arg->width > len && !(arg->flags & F_LJUST)) {
15168 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015169 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015170 writer->pos += sublen;
15171 arg->width = len;
15172 }
15173
15174 /* If padding with spaces: write sign if needed and/or numeric prefix if
15175 the alternate form is used */
15176 if (fill == ' ') {
15177 if (arg->sign) {
15178 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15179 writer->pos += 1;
15180 }
15181 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15182 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15183 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15184 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15185 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15186 writer->pos += 2;
15187 pindex += 2;
15188 }
15189 }
15190
15191 /* Write characters */
15192 if (len) {
15193 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15194 str, pindex, len);
15195 writer->pos += len;
15196 }
15197
15198 /* Pad right with the fill character if needed */
15199 if (arg->width > len) {
15200 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015201 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015202 writer->pos += sublen;
15203 }
15204 return 0;
15205}
15206
15207/* Helper of PyUnicode_Format(): format one arg.
15208 Return 0 on success, raise an exception and return -1 on error. */
15209static int
15210unicode_format_arg(struct unicode_formatter_t *ctx)
15211{
15212 struct unicode_format_arg_t arg;
15213 PyObject *str;
15214 int ret;
15215
Victor Stinner8dbd4212012-12-04 09:30:24 +010015216 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015217 if (arg.ch == '%') {
15218 ctx->fmtpos++;
15219 ctx->fmtcnt--;
15220 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15221 return -1;
15222 return 0;
15223 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015224 arg.flags = 0;
15225 arg.width = -1;
15226 arg.prec = -1;
15227 arg.sign = 0;
15228 str = NULL;
15229
Victor Stinnera47082312012-10-04 02:19:54 +020015230 ret = unicode_format_arg_parse(ctx, &arg);
15231 if (ret == -1)
15232 return -1;
15233
15234 ret = unicode_format_arg_format(ctx, &arg, &str);
15235 if (ret == -1)
15236 return -1;
15237
15238 if (ret != 1) {
15239 ret = unicode_format_arg_output(ctx, &arg, str);
15240 Py_DECREF(str);
15241 if (ret == -1)
15242 return -1;
15243 }
15244
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015245 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015246 PyErr_SetString(PyExc_TypeError,
15247 "not all arguments converted during string formatting");
15248 return -1;
15249 }
15250 return 0;
15251}
15252
Alexander Belopolsky40018472011-02-26 01:02:56 +000015253PyObject *
15254PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015255{
Victor Stinnera47082312012-10-04 02:19:54 +020015256 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015257
Guido van Rossumd57fd912000-03-10 22:53:23 +000015258 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015259 PyErr_BadInternalCall();
15260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015261 }
Victor Stinnera47082312012-10-04 02:19:54 +020015262
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015263 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015264 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015265
15266 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015267 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15268 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15269 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15270 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015271
Victor Stinner8f674cc2013-04-17 23:02:17 +020015272 _PyUnicodeWriter_Init(&ctx.writer);
15273 ctx.writer.min_length = ctx.fmtcnt + 100;
15274 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015275
Guido van Rossumd57fd912000-03-10 22:53:23 +000015276 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015277 ctx.arglen = PyTuple_Size(args);
15278 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015279 }
15280 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015281 ctx.arglen = -1;
15282 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015283 }
Victor Stinnera47082312012-10-04 02:19:54 +020015284 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015285 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015286 ctx.dict = args;
15287 else
15288 ctx.dict = NULL;
15289 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015290
Victor Stinnera47082312012-10-04 02:19:54 +020015291 while (--ctx.fmtcnt >= 0) {
15292 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015293 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015294
15295 nonfmtpos = ctx.fmtpos++;
15296 while (ctx.fmtcnt >= 0 &&
15297 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15298 ctx.fmtpos++;
15299 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015300 }
Victor Stinnera47082312012-10-04 02:19:54 +020015301 if (ctx.fmtcnt < 0) {
15302 ctx.fmtpos--;
15303 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015304 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015305
Victor Stinnercfc4c132013-04-03 01:48:39 +020015306 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15307 nonfmtpos, ctx.fmtpos) < 0)
15308 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015309 }
15310 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015311 ctx.fmtpos++;
15312 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015313 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015314 }
15315 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015316
Victor Stinnera47082312012-10-04 02:19:54 +020015317 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015318 PyErr_SetString(PyExc_TypeError,
15319 "not all arguments converted during string formatting");
15320 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015321 }
15322
Victor Stinnera47082312012-10-04 02:19:54 +020015323 if (ctx.args_owned) {
15324 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015325 }
Victor Stinnera47082312012-10-04 02:19:54 +020015326 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015327
Benjamin Peterson29060642009-01-31 22:14:21 +000015328 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015329 _PyUnicodeWriter_Dealloc(&ctx.writer);
15330 if (ctx.args_owned) {
15331 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015332 }
15333 return NULL;
15334}
15335
Jeremy Hylton938ace62002-07-17 16:30:39 +000015336static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015337unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15338
Tim Peters6d6c1a32001-08-02 04:15:00 +000015339static PyObject *
15340unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15341{
Benjamin Peterson29060642009-01-31 22:14:21 +000015342 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 static char *kwlist[] = {"object", "encoding", "errors", 0};
15344 char *encoding = NULL;
15345 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015346
Benjamin Peterson14339b62009-01-31 16:36:08 +000015347 if (type != &PyUnicode_Type)
15348 return unicode_subtype_new(type, args, kwds);
15349 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015350 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015351 return NULL;
15352 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015353 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015354 if (encoding == NULL && errors == NULL)
15355 return PyObject_Str(x);
15356 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015357 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015358}
15359
Guido van Rossume023fe02001-08-30 03:12:59 +000015360static PyObject *
15361unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15362{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015363 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015364 Py_ssize_t length, char_size;
15365 int share_wstr, share_utf8;
15366 unsigned int kind;
15367 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015368
Benjamin Peterson14339b62009-01-31 16:36:08 +000015369 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015370
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015371 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015372 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015374 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015375 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015376 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015377 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015378 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015379
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015380 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015381 if (self == NULL) {
15382 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015383 return NULL;
15384 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015385 kind = PyUnicode_KIND(unicode);
15386 length = PyUnicode_GET_LENGTH(unicode);
15387
15388 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015389#ifdef Py_DEBUG
15390 _PyUnicode_HASH(self) = -1;
15391#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015392 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015393#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015394 _PyUnicode_STATE(self).interned = 0;
15395 _PyUnicode_STATE(self).kind = kind;
15396 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015397 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015398 _PyUnicode_STATE(self).ready = 1;
15399 _PyUnicode_WSTR(self) = NULL;
15400 _PyUnicode_UTF8_LENGTH(self) = 0;
15401 _PyUnicode_UTF8(self) = NULL;
15402 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015403 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015404
15405 share_utf8 = 0;
15406 share_wstr = 0;
15407 if (kind == PyUnicode_1BYTE_KIND) {
15408 char_size = 1;
15409 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15410 share_utf8 = 1;
15411 }
15412 else if (kind == PyUnicode_2BYTE_KIND) {
15413 char_size = 2;
15414 if (sizeof(wchar_t) == 2)
15415 share_wstr = 1;
15416 }
15417 else {
15418 assert(kind == PyUnicode_4BYTE_KIND);
15419 char_size = 4;
15420 if (sizeof(wchar_t) == 4)
15421 share_wstr = 1;
15422 }
15423
15424 /* Ensure we won't overflow the length. */
15425 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15426 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015427 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015428 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015429 data = PyObject_MALLOC((length + 1) * char_size);
15430 if (data == NULL) {
15431 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015432 goto onError;
15433 }
15434
Victor Stinnerc3c74152011-10-02 20:39:55 +020015435 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015436 if (share_utf8) {
15437 _PyUnicode_UTF8_LENGTH(self) = length;
15438 _PyUnicode_UTF8(self) = data;
15439 }
15440 if (share_wstr) {
15441 _PyUnicode_WSTR_LENGTH(self) = length;
15442 _PyUnicode_WSTR(self) = (wchar_t *)data;
15443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015444
Christian Heimesf051e432016-09-13 20:22:02 +020015445 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015446 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015447 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015448#ifdef Py_DEBUG
15449 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15450#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015451 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015452 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015453
15454onError:
15455 Py_DECREF(unicode);
15456 Py_DECREF(self);
15457 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015458}
15459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015460PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015461"str(object='') -> str\n\
15462str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015463\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015464Create a new string object from the given object. If encoding or\n\
15465errors is specified, then the object must expose a data buffer\n\
15466that will be decoded using the given encoding and error handler.\n\
15467Otherwise, returns the result of object.__str__() (if defined)\n\
15468or repr(object).\n\
15469encoding defaults to sys.getdefaultencoding().\n\
15470errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015471
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015472static PyObject *unicode_iter(PyObject *seq);
15473
Guido van Rossumd57fd912000-03-10 22:53:23 +000015474PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015475 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015476 "str", /* tp_name */
15477 sizeof(PyUnicodeObject), /* tp_basicsize */
15478 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015479 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015480 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015481 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015482 0, /* tp_getattr */
15483 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015484 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015485 unicode_repr, /* tp_repr */
15486 &unicode_as_number, /* tp_as_number */
15487 &unicode_as_sequence, /* tp_as_sequence */
15488 &unicode_as_mapping, /* tp_as_mapping */
15489 (hashfunc) unicode_hash, /* tp_hash*/
15490 0, /* tp_call*/
15491 (reprfunc) unicode_str, /* tp_str */
15492 PyObject_GenericGetAttr, /* tp_getattro */
15493 0, /* tp_setattro */
15494 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015495 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015496 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15497 unicode_doc, /* tp_doc */
15498 0, /* tp_traverse */
15499 0, /* tp_clear */
15500 PyUnicode_RichCompare, /* tp_richcompare */
15501 0, /* tp_weaklistoffset */
15502 unicode_iter, /* tp_iter */
15503 0, /* tp_iternext */
15504 unicode_methods, /* tp_methods */
15505 0, /* tp_members */
15506 0, /* tp_getset */
15507 &PyBaseObject_Type, /* tp_base */
15508 0, /* tp_dict */
15509 0, /* tp_descr_get */
15510 0, /* tp_descr_set */
15511 0, /* tp_dictoffset */
15512 0, /* tp_init */
15513 0, /* tp_alloc */
15514 unicode_new, /* tp_new */
15515 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015516};
15517
15518/* Initialize the Unicode implementation */
15519
Victor Stinner331a6a52019-05-27 16:39:22 +020015520PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015521_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015522{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015523 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015524 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015525 0x000A, /* LINE FEED */
15526 0x000D, /* CARRIAGE RETURN */
15527 0x001C, /* FILE SEPARATOR */
15528 0x001D, /* GROUP SEPARATOR */
15529 0x001E, /* RECORD SEPARATOR */
15530 0x0085, /* NEXT LINE */
15531 0x2028, /* LINE SEPARATOR */
15532 0x2029, /* PARAGRAPH SEPARATOR */
15533 };
15534
Fred Drakee4315f52000-05-09 19:53:39 +000015535 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015536 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015537 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015538 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015539 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015540 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015541
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015542 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015543 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015544 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015545
15546 /* initialize the linebreak bloom filter */
15547 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015548 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015549 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015550
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015551 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015552 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015553 }
15554 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015555 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015556 }
15557 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015558 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015559 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015560 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015561}
15562
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015563
Walter Dörwald16807132007-05-25 13:52:07 +000015564void
15565PyUnicode_InternInPlace(PyObject **p)
15566{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015567 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015568#ifdef Py_DEBUG
15569 assert(s != NULL);
15570 assert(_PyUnicode_CHECK(s));
15571#else
Victor Stinner607b1022020-05-05 18:50:30 +020015572 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015573 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015574 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015575#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015576
Benjamin Peterson14339b62009-01-31 16:36:08 +000015577 /* If it's a subclass, we don't really know what putting
15578 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015579 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015580 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015581 }
15582
15583 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015584 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015585 }
15586
15587#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015588 if (interned == NULL) {
15589 interned = PyDict_New();
15590 if (interned == NULL) {
15591 PyErr_Clear(); /* Don't leave an exception */
15592 return;
15593 }
15594 }
Victor Stinner607b1022020-05-05 18:50:30 +020015595
15596 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015597 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015598 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015599 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015600
Berker Peksagced8d4c2016-07-25 04:40:39 +030015601 if (t == NULL) {
15602 PyErr_Clear();
15603 return;
15604 }
Victor Stinner607b1022020-05-05 18:50:30 +020015605
Berker Peksagced8d4c2016-07-25 04:40:39 +030015606 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015607 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015608 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015609 return;
15610 }
Victor Stinner607b1022020-05-05 18:50:30 +020015611
Benjamin Peterson14339b62009-01-31 16:36:08 +000015612 /* The two references in interned are not counted by refcnt.
15613 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015614 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015615 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015616#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015617}
15618
15619void
15620PyUnicode_InternImmortal(PyObject **p)
15621{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015622 PyUnicode_InternInPlace(p);
15623 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015624 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015625 Py_INCREF(*p);
15626 }
Walter Dörwald16807132007-05-25 13:52:07 +000015627}
15628
15629PyObject *
15630PyUnicode_InternFromString(const char *cp)
15631{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015632 PyObject *s = PyUnicode_FromString(cp);
15633 if (s == NULL)
15634 return NULL;
15635 PyUnicode_InternInPlace(&s);
15636 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015637}
15638
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015639
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo string interning so that a leak detector sees balanced refcounts.

   Every key of the interned dict gets back the references that interning
   left uncounted and is reset to SSTATE_NOT_INTERNED; the dict is then
   cleared and released.  Does nothing if the interned dict does not exist.
   Errors while listing the keys are swallowed (best effort). */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        /* keys may be a non-NULL non-list here: drop our reference
           instead of leaking it. */
        Py_XDECREF(keys);
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n", n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* PyUnicode_InternImmortal() already took one reference back,
               so only one of the dict's two references is uncounted. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Both the key and the value reference held by the dict were
               subtracted when the string was interned. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015699
15700
15701/********************* Unicode Iterator **************************/
15702
15703typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015704 PyObject_HEAD
15705 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015706 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015707} unicodeiterobject;
15708
15709static void
15710unicodeiter_dealloc(unicodeiterobject *it)
15711{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015712 _PyObject_GC_UNTRACK(it);
15713 Py_XDECREF(it->it_seq);
15714 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015715}
15716
15717static int
15718unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15719{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015720 Py_VISIT(it->it_seq);
15721 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015722}
15723
15724static PyObject *
15725unicodeiter_next(unicodeiterobject *it)
15726{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015727 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015728
Benjamin Peterson14339b62009-01-31 16:36:08 +000015729 assert(it != NULL);
15730 seq = it->it_seq;
15731 if (seq == NULL)
15732 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015733 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015735 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15736 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015737 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015738 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15739 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015740 if (item != NULL)
15741 ++it->it_index;
15742 return item;
15743 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015744
Benjamin Peterson14339b62009-01-31 16:36:08 +000015745 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015746 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015747 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015748}
15749
15750static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015751unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015752{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015753 Py_ssize_t len = 0;
15754 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015755 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015756 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015757}
15758
15759PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15760
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015761static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015762unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015763{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015764 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015765 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015766 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015767 it->it_seq, it->it_index);
15768 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015769 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015770 if (u == NULL)
15771 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015772 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015773 }
15774}
15775
15776PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15777
15778static PyObject *
15779unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15780{
15781 Py_ssize_t index = PyLong_AsSsize_t(state);
15782 if (index == -1 && PyErr_Occurred())
15783 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015784 if (it->it_seq != NULL) {
15785 if (index < 0)
15786 index = 0;
15787 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15788 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15789 it->it_index = index;
15790 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015791 Py_RETURN_NONE;
15792}
15793
15794PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15795
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015796static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015797 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015798 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015799 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15800 reduce_doc},
15801 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15802 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015803 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015804};
15805
15806PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015807 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15808 "str_iterator", /* tp_name */
15809 sizeof(unicodeiterobject), /* tp_basicsize */
15810 0, /* tp_itemsize */
15811 /* methods */
15812 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015813 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015814 0, /* tp_getattr */
15815 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015816 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015817 0, /* tp_repr */
15818 0, /* tp_as_number */
15819 0, /* tp_as_sequence */
15820 0, /* tp_as_mapping */
15821 0, /* tp_hash */
15822 0, /* tp_call */
15823 0, /* tp_str */
15824 PyObject_GenericGetAttr, /* tp_getattro */
15825 0, /* tp_setattro */
15826 0, /* tp_as_buffer */
15827 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15828 0, /* tp_doc */
15829 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15830 0, /* tp_clear */
15831 0, /* tp_richcompare */
15832 0, /* tp_weaklistoffset */
15833 PyObject_SelfIter, /* tp_iter */
15834 (iternextfunc)unicodeiter_next, /* tp_iternext */
15835 unicodeiter_methods, /* tp_methods */
15836 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015837};
15838
15839static PyObject *
15840unicode_iter(PyObject *seq)
15841{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015842 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015843
Benjamin Peterson14339b62009-01-31 16:36:08 +000015844 if (!PyUnicode_Check(seq)) {
15845 PyErr_BadInternalCall();
15846 return NULL;
15847 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015848 if (PyUnicode_READY(seq) == -1)
15849 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015850 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15851 if (it == NULL)
15852 return NULL;
15853 it->it_index = 0;
15854 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015855 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015856 _PyObject_GC_TRACK(it);
15857 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015858}
15859
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015860
15861size_t
15862Py_UNICODE_strlen(const Py_UNICODE *u)
15863{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015864 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015865}
15866
15867Py_UNICODE*
15868Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15869{
15870 Py_UNICODE *u = s1;
15871 while ((*u++ = *s2++));
15872 return s1;
15873}
15874
15875Py_UNICODE*
15876Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15877{
15878 Py_UNICODE *u = s1;
15879 while ((*u++ = *s2++))
15880 if (n-- == 0)
15881 break;
15882 return s1;
15883}
15884
15885Py_UNICODE*
15886Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15887{
15888 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015889 u1 += wcslen(u1);
15890 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015891 return s1;
15892}
15893
15894int
15895Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15896{
15897 while (*s1 && *s2 && *s1 == *s2)
15898 s1++, s2++;
15899 if (*s1 && *s2)
15900 return (*s1 < *s2) ? -1 : +1;
15901 if (*s1)
15902 return 1;
15903 if (*s2)
15904 return -1;
15905 return 0;
15906}
15907
15908int
15909Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15910{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015911 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015912 for (; n != 0; n--) {
15913 u1 = *s1;
15914 u2 = *s2;
15915 if (u1 != u2)
15916 return (u1 < u2) ? -1 : +1;
15917 if (u1 == '\0')
15918 return 0;
15919 s1++;
15920 s2++;
15921 }
15922 return 0;
15923}
15924
15925Py_UNICODE*
15926Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15927{
15928 const Py_UNICODE *p;
15929 for (p = s; *p; p++)
15930 if (*p == c)
15931 return (Py_UNICODE*)p;
15932 return NULL;
15933}
15934
15935Py_UNICODE*
15936Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15937{
15938 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015939 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015940 while (p != s) {
15941 p--;
15942 if (*p == c)
15943 return (Py_UNICODE*)p;
15944 }
15945 return NULL;
15946}
Victor Stinner331ea922010-08-10 16:37:20 +000015947
Victor Stinner71133ff2010-09-01 23:43:53 +000015948Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015949PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015950{
Victor Stinner577db2c2011-10-11 22:12:48 +020015951 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015952 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015954 if (!PyUnicode_Check(unicode)) {
15955 PyErr_BadArgument();
15956 return NULL;
15957 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015958 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015959 if (u == NULL)
15960 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015961 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015962 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015963 PyErr_NoMemory();
15964 return NULL;
15965 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015966 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015967 size *= sizeof(Py_UNICODE);
15968 copy = PyMem_Malloc(size);
15969 if (copy == NULL) {
15970 PyErr_NoMemory();
15971 return NULL;
15972 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015973 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015974 return copy;
15975}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015976
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015977
Victor Stinner709d23d2019-05-02 14:56:30 -040015978static int
15979encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015980{
Victor Stinner709d23d2019-05-02 14:56:30 -040015981 int res;
15982 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15983 if (res == -2) {
15984 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15985 return -1;
15986 }
15987 if (res < 0) {
15988 PyErr_NoMemory();
15989 return -1;
15990 }
15991 return 0;
15992}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015993
Victor Stinner709d23d2019-05-02 14:56:30 -040015994
15995static int
15996config_get_codec_name(wchar_t **config_encoding)
15997{
15998 char *encoding;
15999 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16000 return -1;
16001 }
16002
16003 PyObject *name_obj = NULL;
16004 PyObject *codec = _PyCodec_Lookup(encoding);
16005 PyMem_RawFree(encoding);
16006
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016007 if (!codec)
16008 goto error;
16009
16010 name_obj = PyObject_GetAttrString(codec, "name");
16011 Py_CLEAR(codec);
16012 if (!name_obj) {
16013 goto error;
16014 }
16015
Victor Stinner709d23d2019-05-02 14:56:30 -040016016 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16017 Py_DECREF(name_obj);
16018 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016019 goto error;
16020 }
16021
Victor Stinner709d23d2019-05-02 14:56:30 -040016022 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16023 if (raw_wname == NULL) {
16024 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016025 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016026 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016027 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016028
16029 PyMem_RawFree(*config_encoding);
16030 *config_encoding = raw_wname;
16031
16032 PyMem_Free(wname);
16033 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016034
16035error:
16036 Py_XDECREF(codec);
16037 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016038 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016039}
16040
16041
Victor Stinner331a6a52019-05-27 16:39:22 +020016042static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016043init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016044{
Victor Stinner709d23d2019-05-02 14:56:30 -040016045 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016046 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016047 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016048 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016049 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016050 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016051 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016052}
16053
16054
Victor Stinner709d23d2019-05-02 14:56:30 -040016055static int
16056init_fs_codec(PyInterpreterState *interp)
16057{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016058 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016059
16060 _Py_error_handler error_handler;
16061 error_handler = get_error_handler_wide(config->filesystem_errors);
16062 if (error_handler == _Py_ERROR_UNKNOWN) {
16063 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16064 return -1;
16065 }
16066
16067 char *encoding, *errors;
16068 if (encode_wstr_utf8(config->filesystem_encoding,
16069 &encoding,
16070 "filesystem_encoding") < 0) {
16071 return -1;
16072 }
16073
16074 if (encode_wstr_utf8(config->filesystem_errors,
16075 &errors,
16076 "filesystem_errors") < 0) {
16077 PyMem_RawFree(encoding);
16078 return -1;
16079 }
16080
Victor Stinner3d17c042020-05-14 01:48:38 +020016081 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16082 PyMem_RawFree(fs_codec->encoding);
16083 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016084 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016085 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16086 PyMem_RawFree(fs_codec->errors);
16087 fs_codec->errors = errors;
16088 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016089
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016090#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016091 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016092#endif
16093
Victor Stinner709d23d2019-05-02 14:56:30 -040016094 /* At this point, PyUnicode_EncodeFSDefault() and
16095 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16096 the C implementation of the filesystem encoding. */
16097
16098 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16099 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016100 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16101 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016102 PyErr_NoMemory();
16103 return -1;
16104 }
16105 return 0;
16106}
16107
16108
Victor Stinner331a6a52019-05-27 16:39:22 +020016109static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016110init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016111{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016112 PyInterpreterState *interp = tstate->interp;
16113
Victor Stinner709d23d2019-05-02 14:56:30 -040016114 /* Update the filesystem encoding to the normalized Python codec name.
16115 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16116 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016117 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016118 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016119 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016120 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016121 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016122 }
16123
Victor Stinner709d23d2019-05-02 14:56:30 -040016124 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016125 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016126 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016127 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016128}
16129
16130
Victor Stinner331a6a52019-05-27 16:39:22 +020016131PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016132_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016133{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016134 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016135 if (_PyStatus_EXCEPTION(status)) {
16136 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016137 }
16138
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016139 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016140}
16141
16142
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016143static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016144_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016145{
Victor Stinner3d17c042020-05-14 01:48:38 +020016146 PyMem_RawFree(fs_codec->encoding);
16147 fs_codec->encoding = NULL;
16148 fs_codec->utf8 = 0;
16149 PyMem_RawFree(fs_codec->errors);
16150 fs_codec->errors = NULL;
16151 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016152}
16153
16154
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   "mbcs" with the "replace" error handler and re-initialize the
   filesystem codec.  Returns 0 on success, -1 with an exception set on
   failure. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    /* The configuration is modified in place, hence the cast. */
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        /* At most one of them was allocated; release whichever it is. */
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* Both copies exist: swap them in, releasing the old values. */
    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16180
16181
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016182void
Victor Stinner3d483342019-11-22 12:27:50 +010016183_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016184{
Victor Stinner3d483342019-11-22 12:27:50 +010016185 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016186#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016187 /* Insure++ is a memory analysis tool that aids in discovering
16188 * memory leaks and other memory problems. On Python exit, the
16189 * interned string dictionaries are flagged as being in use at exit
16190 * (which it is). Under normal circumstances, this is fine because
16191 * the memory will be automatically reclaimed by the system. Under
16192 * memory debugging, it's a huge source of useless noise, so we
16193 * trade off slower shutdown for less distraction in the memory
16194 * reports. -baw
16195 */
16196 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016197#endif /* __INSURE__ */
16198
Victor Stinner3d483342019-11-22 12:27:50 +010016199 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016200
Victor Stinner607b1022020-05-05 18:50:30 +020016201#ifdef LATIN1_SINGLETONS
Victor Stinner3d483342019-11-22 12:27:50 +010016202 for (Py_ssize_t i = 0; i < 256; i++) {
16203 Py_CLEAR(unicode_latin1[i]);
16204 }
Victor Stinner607b1022020-05-05 18:50:30 +020016205#endif
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016206 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016207 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016208
Victor Stinner3d17c042020-05-14 01:48:38 +020016209 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016210}
16211
16212
Georg Brandl66c221e2010-10-14 07:04:07 +000016213/* A _string module, to export formatter_parser and formatter_field_name_split
16214 to the string.Formatter class implemented in Python. */
16215
16216static PyMethodDef _string_methods[] = {
16217 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16218 METH_O, PyDoc_STR("split the argument as a field name")},
16219 {"formatter_parser", (PyCFunction) formatter_parser,
16220 METH_O, PyDoc_STR("parse the argument as a format string")},
16221 {NULL, NULL}
16222};
16223
16224static struct PyModuleDef _string_module = {
16225 PyModuleDef_HEAD_INIT,
16226 "_string",
16227 PyDoc_STR("string helper module"),
16228 0,
16229 _string_methods,
16230 NULL,
16231 NULL,
16232 NULL,
16233 NULL
16234};
16235
16236PyMODINIT_FUNC
16237PyInit__string(void)
16238{
16239 return PyModule_Create(&_string_module);
16240}
16241
16242
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016243#ifdef __cplusplus
16244}
16245#endif