blob: 34b747ec7bb7eef6ccafa68dba48363b424b1176 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020047#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinnerbcda8f12018-11-21 22:27:47 +010048#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020049#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040050#include "pycore_pylifecycle.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020051#include "pycore_pystate.h" // _PyInterpreterState_GET()
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000052#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070053#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
61/* #define INTERNED_STATS 1 */
62
63
Larry Hastings61272b72014-01-07 12:41:53 -080064/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090065class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080066[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090067/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
69/*[python input]
70class Py_UCS4_converter(CConverter):
71 type = 'Py_UCS4'
72 converter = 'convert_uc'
73
74 def converter_init(self):
75 if self.default is not unspecified:
76 self.c_default = ascii(self.default)
77 if len(self.c_default) > 4 or self.c_default[0] != "'":
78 self.c_default = hex(ord(self.default))
79
80[python start generated code]*/
81/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchaka05997252013-01-26 12:14:02 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner8faf8212011-12-08 22:14:11 +010096/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
97#define MAX_UNICODE 0x10ffff
98
Victor Stinner910337b2011-10-03 03:20:16 +020099#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200100# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +0200101#else
102# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
103#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200104
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200105#define _PyUnicode_UTF8(op) \
106 (((PyCompactUnicodeObject*)(op))->utf8)
107#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200108 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200109 assert(PyUnicode_IS_READY(op)), \
110 PyUnicode_IS_COMPACT_ASCII(op) ? \
111 ((char*)((PyASCIIObject*)(op) + 1)) : \
112 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200113#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200114 (((PyCompactUnicodeObject*)(op))->utf8_length)
115#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200116 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200117 assert(PyUnicode_IS_READY(op)), \
118 PyUnicode_IS_COMPACT_ASCII(op) ? \
119 ((PyASCIIObject*)(op))->length : \
120 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200121#define _PyUnicode_WSTR(op) \
122 (((PyASCIIObject*)(op))->wstr)
123#define _PyUnicode_WSTR_LENGTH(op) \
124 (((PyCompactUnicodeObject*)(op))->wstr_length)
125#define _PyUnicode_LENGTH(op) \
126 (((PyASCIIObject *)(op))->length)
127#define _PyUnicode_STATE(op) \
128 (((PyASCIIObject *)(op))->state)
129#define _PyUnicode_HASH(op) \
130 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200131#define _PyUnicode_KIND(op) \
132 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200133 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200134#define _PyUnicode_GET_LENGTH(op) \
135 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200137#define _PyUnicode_DATA_ANY(op) \
138 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200139
Victor Stinner910337b2011-10-03 03:20:16 +0200140#undef PyUnicode_READY
141#define PyUnicode_READY(op) \
142 (assert(_PyUnicode_CHECK(op)), \
143 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200144 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100145 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200146
Victor Stinnerc379ead2011-10-03 12:52:27 +0200147#define _PyUnicode_SHARE_UTF8(op) \
148 (assert(_PyUnicode_CHECK(op)), \
149 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
150 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same pointer as its canonical
   data buffer, i.e. _PyUnicode_WSTR(op) == PyUnicode_DATA(op).
   Every reference goes through the macro parameter `op`, so the macro no
   longer depends on the caller having a variable named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
154
Victor Stinner829c0ad2011-10-03 01:08:02 +0200155/* true if the Unicode object has an allocated UTF-8 memory block
156 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200157#define _PyUnicode_HAS_UTF8_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200158 ((!PyUnicode_IS_COMPACT_ASCII(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200159 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200160 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
161
Victor Stinner03490912011-10-03 23:45:12 +0200162/* true if the Unicode object has an allocated wstr memory block
163 (not shared with other data) */
164#define _PyUnicode_HAS_WSTR_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200165 ((_PyUnicode_WSTR(op) && \
Victor Stinner03490912011-10-03 23:45:12 +0200166 (!PyUnicode_IS_READY(op) || \
167 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
/* Generic helper macro converting a run of characters from one character
   type to another.  from_type and to_type must be valid type names.
   begin and end delimit the source characters (pointers to "from_type");
   to points to the destination buffer (pointer to "to_type").
   Each source element is cast to to_type and stored in order.  The bulk of
   the input is copied four elements per iteration; the remaining (at most
   three) elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t _len = _end - _iter;                                 \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_len, 4);                       \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; ++_iter, ++_to) {                          \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200192
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200193#ifdef MS_WINDOWS
194 /* On Windows, overallocate by 50% is the best factor */
195# define OVERALLOCATE_FACTOR 2
196#else
197 /* On Linux, overallocate by 25% is the best factor */
198# define OVERALLOCATE_FACTOR 4
199#endif
200
Victor Stinner607b1022020-05-05 18:50:30 +0200201/* bpo-40521: Interned strings are shared by all interpreters. */
202#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
203# define INTERNED_STRINGS
204#endif
205
Walter Dörwald16807132007-05-25 13:52:07 +0000206/* This dictionary holds all interned unicode strings. Note that references
207 to strings in this dictionary are *not* counted in the string's ob_refcnt.
208 When the interned string reaches a refcnt of 0 the string deallocation
209 function will delete the reference from this dictionary.
210
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000213*/
Victor Stinner607b1022020-05-05 18:50:30 +0200214#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200215static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200216#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000217
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000218/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200219static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200220
/* Take a new reference to the shared empty string.  If unicode_empty has
   not been created yet, create it with PyUnicode_New(0, 0) first; when that
   call returns NULL, unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return unicode_empty from the enclosing function after running
   _Py_INCREF_UNICODE_EMPTY().  The returned value is NULL if the empty
   string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000239
Victor Stinner59423e32018-11-26 13:40:01 +0100240static inline void
241unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
242 Py_ssize_t start, Py_ssize_t length)
243{
244 assert(0 <= start);
245 assert(kind != PyUnicode_WCHAR_KIND);
246 switch (kind) {
247 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100248 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100249 Py_UCS1 ch = (unsigned char)value;
250 Py_UCS1 *to = (Py_UCS1 *)data + start;
251 memset(to, ch, length);
252 break;
253 }
254 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS2 ch = (Py_UCS2)value;
257 Py_UCS2 *to = (Py_UCS2 *)data + start;
258 const Py_UCS2 *end = to + length;
259 for (; to < end; ++to) *to = ch;
260 break;
261 }
262 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100263 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100264 Py_UCS4 ch = value;
265 Py_UCS4 * to = (Py_UCS4 *)data + start;
266 const Py_UCS4 *end = to + length;
267 for (; to < end; ++to) *to = ch;
268 break;
269 }
270 default: Py_UNREACHABLE();
271 }
272}
273
274
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200275/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700276static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900278static inline void
279_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400280static PyObject *
281unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
282 const char *errors);
283static PyObject *
284unicode_decode_utf8(const char *s, Py_ssize_t size,
285 _Py_error_handler error_handler, const char *errors,
286 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200287
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200288/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200289static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200290
Victor Stinner607b1022020-05-05 18:50:30 +0200291/* bpo-40521: Latin1 singletons are shared by all interpreters. */
292#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
293# define LATIN1_SINGLETONS
294#endif
295
296#ifdef LATIN1_SINGLETONS
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297/* Single character Unicode strings in the Latin-1 range are being
298 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200299static PyObject *unicode_latin1[256] = {NULL};
Victor Stinner607b1022020-05-05 18:50:30 +0200300#endif
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000301
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, holding 1 for the characters listed below
   and 0 for every other entry. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
332
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200333/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200334static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200335static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100336static int unicode_modifiable(PyObject *unicode);
337
Victor Stinnerfe226c02011-10-03 03:52:20 +0200338
Alexander Belopolsky40018472011-02-26 01:02:56 +0000339static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100340_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200341static PyObject *
342_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
343static PyObject *
344_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
345
346static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000347unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000348 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100349 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000350 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
351
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352static void
353raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300354 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100355 PyObject *unicode,
356 Py_ssize_t startpos, Py_ssize_t endpos,
357 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000358
/* Fast detection of ASCII line breaks: a 128-entry table indexed by ASCII
   code, holding 1 for the characters listed below and 0 for every other
   entry. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
386
INADA Naoki3ae20562017-01-16 20:41:20 +0900387static int convert_uc(PyObject *obj, void *addr);
388
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300389#include "clinic/unicodeobject.c.h"
390
Victor Stinner3d4226a2018-08-29 22:21:32 +0200391_Py_error_handler
392_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200393{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200395 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200396 }
397 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 }
400 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200401 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200402 }
403 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200404 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200405 }
406 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200407 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200408 }
409 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200410 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200411 }
412 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200413 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200414 }
Victor Stinner50149202015-09-22 00:26:54 +0200415 return _Py_ERROR_OTHER;
416}
417
Victor Stinner709d23d2019-05-02 14:56:30 -0400418
419static _Py_error_handler
420get_error_handler_wide(const wchar_t *errors)
421{
422 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
423 return _Py_ERROR_STRICT;
424 }
425 if (wcscmp(errors, L"surrogateescape") == 0) {
426 return _Py_ERROR_SURROGATEESCAPE;
427 }
428 if (wcscmp(errors, L"replace") == 0) {
429 return _Py_ERROR_REPLACE;
430 }
431 if (wcscmp(errors, L"ignore") == 0) {
432 return _Py_ERROR_IGNORE;
433 }
434 if (wcscmp(errors, L"backslashreplace") == 0) {
435 return _Py_ERROR_BACKSLASHREPLACE;
436 }
437 if (wcscmp(errors, L"surrogatepass") == 0) {
438 return _Py_ERROR_SURROGATEPASS;
439 }
440 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
441 return _Py_ERROR_XMLCHARREFREPLACE;
442 }
443 return _Py_ERROR_OTHER;
444}
445
446
Victor Stinner22eb6892019-06-26 00:51:05 +0200447static inline int
448unicode_check_encoding_errors(const char *encoding, const char *errors)
449{
450 if (encoding == NULL && errors == NULL) {
451 return 0;
452 }
453
Victor Stinner81a7be32020-04-14 15:14:01 +0200454 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200455#ifndef Py_DEBUG
456 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200457 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200458 return 0;
459 }
460#else
461 /* Always check in debug mode */
462#endif
463
464 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
465 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
466 if (!interp->fs_codec.encoding) {
467 return 0;
468 }
469
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200470 /* Disable checks during Python finalization. For example, it allows to
471 call _PyObject_Dump() during finalization for debugging purpose. */
472 if (interp->finalizing) {
473 return 0;
474 }
475
Victor Stinner22eb6892019-06-26 00:51:05 +0200476 if (encoding != NULL) {
477 PyObject *handler = _PyCodec_Lookup(encoding);
478 if (handler == NULL) {
479 return -1;
480 }
481 Py_DECREF(handler);
482 }
483
484 if (errors != NULL) {
485 PyObject *handler = PyCodec_LookupError(errors);
486 if (handler == NULL) {
487 return -1;
488 }
489 Py_DECREF(handler);
490 }
491 return 0;
492}
493
494
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300495/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
496 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000497Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000498PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000499{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000500#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000501 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000502#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000503 /* This is actually an illegal character, so it should
504 not be passed to unichr. */
505 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000506#endif
507}
508
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200509int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100510_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200511{
Victor Stinner68762572019-10-07 18:42:01 +0200512#define CHECK(expr) \
513 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
514
Victor Stinner910337b2011-10-03 03:20:16 +0200515 PyASCIIObject *ascii;
516 unsigned int kind;
517
Victor Stinner68762572019-10-07 18:42:01 +0200518 assert(op != NULL);
519 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200520
521 ascii = (PyASCIIObject *)op;
522 kind = ascii->state.kind;
523
Victor Stinnera3b334d2011-10-03 13:53:37 +0200524 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND);
526 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200527 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200528 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200529 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200530 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200531
Victor Stinnera41463c2011-10-04 01:05:08 +0200532 if (ascii->state.compact == 1) {
533 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200534 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200535 || kind == PyUnicode_2BYTE_KIND
536 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->state.ascii == 0);
538 CHECK(ascii->state.ready == 1);
539 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100540 }
541 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
543
544 data = unicode->data.any;
545 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(ascii->length == 0);
547 CHECK(ascii->hash == -1);
548 CHECK(ascii->state.compact == 0);
549 CHECK(ascii->state.ascii == 0);
550 CHECK(ascii->state.ready == 0);
551 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
552 CHECK(ascii->wstr != NULL);
553 CHECK(data == NULL);
554 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200555 }
556 else {
Victor Stinner68762572019-10-07 18:42:01 +0200557 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200558 || kind == PyUnicode_2BYTE_KIND
559 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200560 CHECK(ascii->state.compact == 0);
561 CHECK(ascii->state.ready == 1);
562 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200563 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200564 CHECK(compact->utf8 == data);
565 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 }
567 else
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200569 }
570 }
571 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200572 if (
573#if SIZEOF_WCHAR_T == 2
574 kind == PyUnicode_2BYTE_KIND
575#else
576 kind == PyUnicode_4BYTE_KIND
577#endif
578 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 {
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(ascii->wstr == data);
581 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200582 } else
Victor Stinner68762572019-10-07 18:42:01 +0200583 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200584 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200585
586 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200587 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200588 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200589 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200590 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200591
592 /* check that the best kind is used: O(n) operation */
593 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 Py_ssize_t i;
595 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300596 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200597 Py_UCS4 ch;
598
599 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200600 for (i=0; i < ascii->length; i++)
601 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200602 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200603 if (ch > maxchar)
604 maxchar = ch;
605 }
606 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100607 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200608 CHECK(maxchar >= 128);
609 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100610 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200611 else
Victor Stinner68762572019-10-07 18:42:01 +0200612 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200613 }
Victor Stinner77faf692011-11-20 18:56:05 +0100614 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200615 CHECK(maxchar >= 0x100);
616 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100617 }
618 else {
Victor Stinner68762572019-10-07 18:42:01 +0200619 CHECK(maxchar >= 0x10000);
620 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100621 }
Victor Stinner68762572019-10-07 18:42:01 +0200622 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200623 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400624 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200625
626#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400627}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200628
Victor Stinner910337b2011-10-03 03:20:16 +0200629
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630static PyObject*
631unicode_result_wchar(PyObject *unicode)
632{
633#ifndef Py_DEBUG
634 Py_ssize_t len;
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 len = _PyUnicode_WSTR_LENGTH(unicode);
637 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200639 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100640 }
641
642 if (len == 1) {
643 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100644 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100645 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
646 Py_DECREF(unicode);
647 return latin1_char;
648 }
649 }
650
651 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200652 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 return NULL;
654 }
655#else
Victor Stinneraa771272012-10-04 02:32:58 +0200656 assert(Py_REFCNT(unicode) == 1);
657
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100658 /* don't make the result ready in debug mode to ensure that the caller
659 makes the string ready before using it */
660 assert(_PyUnicode_CheckConsistency(unicode, 1));
661#endif
662 return unicode;
663}
664
665static PyObject*
666unicode_result_ready(PyObject *unicode)
667{
668 Py_ssize_t length;
669
670 length = PyUnicode_GET_LENGTH(unicode);
671 if (length == 0) {
672 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100673 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200674 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100675 }
676 return unicode_empty;
677 }
678
Victor Stinner607b1022020-05-05 18:50:30 +0200679#ifdef LATIN1_SINGLETONS
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100680 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300681 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200682 int kind = PyUnicode_KIND(unicode);
683 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100684 if (ch < 256) {
685 PyObject *latin1_char = unicode_latin1[ch];
686 if (latin1_char != NULL) {
687 if (unicode != latin1_char) {
688 Py_INCREF(latin1_char);
689 Py_DECREF(unicode);
690 }
691 return latin1_char;
692 }
693 else {
694 assert(_PyUnicode_CheckConsistency(unicode, 1));
695 Py_INCREF(unicode);
696 unicode_latin1[ch] = unicode;
697 return unicode;
698 }
699 }
700 }
Victor Stinner607b1022020-05-05 18:50:30 +0200701#endif
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100702
703 assert(_PyUnicode_CheckConsistency(unicode, 1));
704 return unicode;
705}
706
707static PyObject*
708unicode_result(PyObject *unicode)
709{
710 assert(_PyUnicode_CHECK(unicode));
711 if (PyUnicode_IS_READY(unicode))
712 return unicode_result_ready(unicode);
713 else
714 return unicode_result_wchar(unicode);
715}
716
Victor Stinnerc4b49542011-12-11 22:44:26 +0100717static PyObject*
718unicode_result_unchanged(PyObject *unicode)
719{
720 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500721 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722 return NULL;
723 Py_INCREF(unicode);
724 return unicode;
725 }
726 else
727 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100728 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100729}
730
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
732 ASCII, Latin1, UTF-8, etc. */
733static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200734backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
736{
Victor Stinnerad771582015-10-09 12:38:53 +0200737 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200738 Py_UCS4 ch;
739 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300740 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200741
742 assert(PyUnicode_IS_READY(unicode));
743 kind = PyUnicode_KIND(unicode);
744 data = PyUnicode_DATA(unicode);
745
746 size = 0;
747 /* determine replacement size */
748 for (i = collstart; i < collend; ++i) {
749 Py_ssize_t incr;
750
751 ch = PyUnicode_READ(kind, data, i);
752 if (ch < 0x100)
753 incr = 2+2;
754 else if (ch < 0x10000)
755 incr = 2+4;
756 else {
757 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200758 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200759 }
760 if (size > PY_SSIZE_T_MAX - incr) {
761 PyErr_SetString(PyExc_OverflowError,
762 "encoded result is too long for a Python string");
763 return NULL;
764 }
765 size += incr;
766 }
767
Victor Stinnerad771582015-10-09 12:38:53 +0200768 str = _PyBytesWriter_Prepare(writer, str, size);
769 if (str == NULL)
770 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771
772 /* generate replacement */
773 for (i = collstart; i < collend; ++i) {
774 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200775 *str++ = '\\';
776 if (ch >= 0x00010000) {
777 *str++ = 'U';
778 *str++ = Py_hexdigits[(ch>>28)&0xf];
779 *str++ = Py_hexdigits[(ch>>24)&0xf];
780 *str++ = Py_hexdigits[(ch>>20)&0xf];
781 *str++ = Py_hexdigits[(ch>>16)&0xf];
782 *str++ = Py_hexdigits[(ch>>12)&0xf];
783 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200784 }
Victor Stinner797485e2015-10-09 03:17:30 +0200785 else if (ch >= 0x100) {
786 *str++ = 'u';
787 *str++ = Py_hexdigits[(ch>>12)&0xf];
788 *str++ = Py_hexdigits[(ch>>8)&0xf];
789 }
790 else
791 *str++ = 'x';
792 *str++ = Py_hexdigits[(ch>>4)&0xf];
793 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200794 }
795 return str;
796}
797
798/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
799 ASCII, Latin1, UTF-8, etc. */
800static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200801xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200802 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
803{
Victor Stinnerad771582015-10-09 12:38:53 +0200804 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200805 Py_UCS4 ch;
806 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300807 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200808
809 assert(PyUnicode_IS_READY(unicode));
810 kind = PyUnicode_KIND(unicode);
811 data = PyUnicode_DATA(unicode);
812
813 size = 0;
814 /* determine replacement size */
815 for (i = collstart; i < collend; ++i) {
816 Py_ssize_t incr;
817
818 ch = PyUnicode_READ(kind, data, i);
819 if (ch < 10)
820 incr = 2+1+1;
821 else if (ch < 100)
822 incr = 2+2+1;
823 else if (ch < 1000)
824 incr = 2+3+1;
825 else if (ch < 10000)
826 incr = 2+4+1;
827 else if (ch < 100000)
828 incr = 2+5+1;
829 else if (ch < 1000000)
830 incr = 2+6+1;
831 else {
832 assert(ch <= MAX_UNICODE);
833 incr = 2+7+1;
834 }
835 if (size > PY_SSIZE_T_MAX - incr) {
836 PyErr_SetString(PyExc_OverflowError,
837 "encoded result is too long for a Python string");
838 return NULL;
839 }
840 size += incr;
841 }
842
Victor Stinnerad771582015-10-09 12:38:53 +0200843 str = _PyBytesWriter_Prepare(writer, str, size);
844 if (str == NULL)
845 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200846
847 /* generate replacement */
848 for (i = collstart; i < collend; ++i) {
849 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
850 }
851 return str;
852}
853
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854/* --- Bloom Filters ----------------------------------------------------- */
855
856/* stuff to implement simple "bloom filters" for Unicode characters.
857 to keep things simple, we use a single bitmask, using the least 5
858 bits from each unicode characters as the bit index. */
859
860/* the linebreak mask is set up by Unicode_Init below */
861
Antoine Pitrouf068f942010-01-13 14:19:12 +0000862#if LONG_BIT >= 128
863#define BLOOM_WIDTH 128
864#elif LONG_BIT >= 64
865#define BLOOM_WIDTH 64
866#elif LONG_BIT >= 32
867#define BLOOM_WIDTH 32
868#else
869#error "LONG_BIT is smaller than 32"
870#endif
871
Thomas Wouters477c8d52006-05-27 19:21:47 +0000872#define BLOOM_MASK unsigned long
873
Serhiy Storchaka05997252013-01-26 12:14:02 +0200874static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000875
Antoine Pitrouf068f942010-01-13 14:19:12 +0000876#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877
Benjamin Peterson29060642009-01-31 22:14:21 +0000878#define BLOOM_LINEBREAK(ch) \
879 ((ch) < 128U ? ascii_linebreak[(ch)] : \
880 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000881
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700882static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300883make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884{
Victor Stinnera85af502013-04-09 21:53:54 +0200885#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
886 do { \
887 TYPE *data = (TYPE *)PTR; \
888 TYPE *end = data + LEN; \
889 Py_UCS4 ch; \
890 for (; data != end; data++) { \
891 ch = *data; \
892 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
893 } \
894 break; \
895 } while (0)
896
Thomas Wouters477c8d52006-05-27 19:21:47 +0000897 /* calculate simple bloom-style bitmask for a given unicode string */
898
Antoine Pitrouf068f942010-01-13 14:19:12 +0000899 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000900
901 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200902 switch (kind) {
903 case PyUnicode_1BYTE_KIND:
904 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
905 break;
906 case PyUnicode_2BYTE_KIND:
907 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
908 break;
909 case PyUnicode_4BYTE_KIND:
910 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
911 break;
912 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700913 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200914 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000915 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200916
917#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000918}
919
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920static int
921ensure_unicode(PyObject *obj)
922{
923 if (!PyUnicode_Check(obj)) {
924 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200925 "must be str, not %.100s",
926 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300927 return -1;
928 }
929 return PyUnicode_READY(obj);
930}
931
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200932/* Compilation of templated routines */
933
934#include "stringlib/asciilib.h"
935#include "stringlib/fastsearch.h"
936#include "stringlib/partition.h"
937#include "stringlib/split.h"
938#include "stringlib/count.h"
939#include "stringlib/find.h"
940#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200941#include "stringlib/undef.h"
942
943#include "stringlib/ucs1lib.h"
944#include "stringlib/fastsearch.h"
945#include "stringlib/partition.h"
946#include "stringlib/split.h"
947#include "stringlib/count.h"
948#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300949#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200950#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200951#include "stringlib/undef.h"
952
953#include "stringlib/ucs2lib.h"
954#include "stringlib/fastsearch.h"
955#include "stringlib/partition.h"
956#include "stringlib/split.h"
957#include "stringlib/count.h"
958#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300959#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200960#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200961#include "stringlib/undef.h"
962
963#include "stringlib/ucs4lib.h"
964#include "stringlib/fastsearch.h"
965#include "stringlib/partition.h"
966#include "stringlib/split.h"
967#include "stringlib/count.h"
968#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300969#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200970#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200971#include "stringlib/undef.h"
972
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200973#include "stringlib/unicodedefs.h"
974#include "stringlib/fastsearch.h"
975#include "stringlib/count.h"
976#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100977#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten, every byte being set to 0xff; nothing happens if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size of one character in bytes */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 Ux0000 terminated; some code (e.g. new_identifier)
1213 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 if (length == 0 && unicode_empty != NULL) {
1228 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001229 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
1231
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001232 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001233 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001234 return (PyUnicodeObject *)PyErr_NoMemory();
1235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001236 if (length < 0) {
1237 PyErr_SetString(PyExc_SystemError,
1238 "Negative size passed to _PyUnicode_New");
1239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 }
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1243 if (unicode == NULL)
1244 return NULL;
1245 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001246
1247 _PyUnicode_WSTR_LENGTH(unicode) = length;
1248 _PyUnicode_HASH(unicode) = -1;
1249 _PyUnicode_STATE(unicode).interned = 0;
1250 _PyUnicode_STATE(unicode).kind = 0;
1251 _PyUnicode_STATE(unicode).compact = 0;
1252 _PyUnicode_STATE(unicode).ready = 0;
1253 _PyUnicode_STATE(unicode).ascii = 0;
1254 _PyUnicode_DATA_ANY(unicode) = NULL;
1255 _PyUnicode_LENGTH(unicode) = 0;
1256 _PyUnicode_UTF8(unicode) = NULL;
1257 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1260 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001261 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001262 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001263 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001265
Jeremy Hyltond8082792003-09-16 19:41:39 +00001266 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001267 * the caller fails before initializing str -- unicode_resize()
1268 * reads str[0], and the Keep-Alive optimization can keep memory
1269 * allocated for str alive across a call to unicode_dealloc(unicode).
1270 * We don't want unicode_resize to read uninitialized memory in
1271 * that case.
1272 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 _PyUnicode_WSTR(unicode)[0] = 0;
1274 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001275
Victor Stinner7931d9a2011-11-04 00:22:48 +01001276 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 return unicode;
1278}
1279
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280static const char*
1281unicode_kind_name(PyObject *unicode)
1282{
Victor Stinner42dfd712011-10-03 14:41:45 +02001283 /* don't check consistency: unicode_kind_name() is called from
1284 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001285 if (!PyUnicode_IS_COMPACT(unicode))
1286 {
1287 if (!PyUnicode_IS_READY(unicode))
1288 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001289 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001290 {
1291 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001292 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001293 return "legacy ascii";
1294 else
1295 return "legacy latin1";
1296 case PyUnicode_2BYTE_KIND:
1297 return "legacy UCS2";
1298 case PyUnicode_4BYTE_KIND:
1299 return "legacy UCS4";
1300 default:
1301 return "<legacy invalid kind>";
1302 }
1303 }
1304 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001305 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001306 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001307 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001308 return "ascii";
1309 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001310 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001311 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001312 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001313 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001314 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001315 default:
1316 return "<invalid compact kind>";
1317 }
1318}
1319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001322const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001323 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001324 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325}
1326
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001327const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001328 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329 return _PyUnicode_COMPACT_DATA(unicode);
1330}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001331const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001332 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001333 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1335 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1336 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1337 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1338 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1339 return PyUnicode_DATA(unicode);
1340}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001341
1342void
1343_PyUnicode_Dump(PyObject *op)
1344{
1345 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001346 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1347 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001348 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001349
Victor Stinnera849a4b2011-10-03 12:12:11 +02001350 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001351 {
1352 if (ascii->state.ascii)
1353 data = (ascii + 1);
1354 else
1355 data = (compact + 1);
1356 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001357 else
1358 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001359 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1360 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001361
Victor Stinnera849a4b2011-10-03 12:12:11 +02001362 if (ascii->wstr == data)
1363 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001364 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001365
Victor Stinnera3b334d2011-10-03 13:53:37 +02001366 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001367 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001368 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1369 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001370 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001371 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001372 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001373 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001374}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375#endif
1376
1377PyObject *
1378PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1379{
1380 PyObject *obj;
1381 PyCompactUnicodeObject *unicode;
1382 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001383 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001384 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 Py_ssize_t char_size;
1386 Py_ssize_t struct_size;
1387
1388 /* Optimization for empty strings */
1389 if (size == 0 && unicode_empty != NULL) {
1390 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001391 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 }
1393
Victor Stinner9e9d6892011-10-04 01:02:02 +02001394 is_ascii = 0;
1395 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 struct_size = sizeof(PyCompactUnicodeObject);
1397 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001398 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 char_size = 1;
1400 is_ascii = 1;
1401 struct_size = sizeof(PyASCIIObject);
1402 }
1403 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001404 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 char_size = 1;
1406 }
1407 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001408 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 char_size = 2;
1410 if (sizeof(wchar_t) == 2)
1411 is_sharing = 1;
1412 }
1413 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001414 if (maxchar > MAX_UNICODE) {
1415 PyErr_SetString(PyExc_SystemError,
1416 "invalid maximum character passed to PyUnicode_New");
1417 return NULL;
1418 }
Victor Stinner8f825062012-04-27 13:55:39 +02001419 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 char_size = 4;
1421 if (sizeof(wchar_t) == 4)
1422 is_sharing = 1;
1423 }
1424
1425 /* Ensure we won't overflow the size. */
1426 if (size < 0) {
1427 PyErr_SetString(PyExc_SystemError,
1428 "Negative size passed to PyUnicode_New");
1429 return NULL;
1430 }
1431 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1432 return PyErr_NoMemory();
1433
1434 /* Duplicated allocation code from _PyObject_New() instead of a call to
1435 * PyObject_New() so we are able to allocate space for the object and
1436 * it's data buffer.
1437 */
1438 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1439 if (obj == NULL)
1440 return PyErr_NoMemory();
1441 obj = PyObject_INIT(obj, &PyUnicode_Type);
1442 if (obj == NULL)
1443 return NULL;
1444
1445 unicode = (PyCompactUnicodeObject *)obj;
1446 if (is_ascii)
1447 data = ((PyASCIIObject*)obj) + 1;
1448 else
1449 data = unicode + 1;
1450 _PyUnicode_LENGTH(unicode) = size;
1451 _PyUnicode_HASH(unicode) = -1;
1452 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001453 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 _PyUnicode_STATE(unicode).compact = 1;
1455 _PyUnicode_STATE(unicode).ready = 1;
1456 _PyUnicode_STATE(unicode).ascii = is_ascii;
1457 if (is_ascii) {
1458 ((char*)data)[size] = 0;
1459 _PyUnicode_WSTR(unicode) = NULL;
1460 }
Victor Stinner8f825062012-04-27 13:55:39 +02001461 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 ((char*)data)[size] = 0;
1463 _PyUnicode_WSTR(unicode) = NULL;
1464 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001466 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 else {
1469 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001470 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001471 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001473 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 ((Py_UCS4*)data)[size] = 0;
1475 if (is_sharing) {
1476 _PyUnicode_WSTR_LENGTH(unicode) = size;
1477 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1478 }
1479 else {
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1481 _PyUnicode_WSTR(unicode) = NULL;
1482 }
1483 }
Victor Stinner8f825062012-04-27 13:55:39 +02001484#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001485 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001486#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001487 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 return obj;
1489}
1490
1491#if SIZEOF_WCHAR_T == 2
1492/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1493 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001494 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495
1496 This function assumes that unicode can hold one more code point than wstr
1497 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001498static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001500 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501{
1502 const wchar_t *iter;
1503 Py_UCS4 *ucs4_out;
1504
Victor Stinner910337b2011-10-03 03:20:16 +02001505 assert(unicode != NULL);
1506 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1508 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1509
1510 for (iter = begin; iter < end; ) {
1511 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1512 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001513 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1514 && (iter+1) < end
1515 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 {
Victor Stinner551ac952011-11-29 22:58:13 +01001517 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 iter += 2;
1519 }
1520 else {
1521 *ucs4_out++ = *iter;
1522 iter++;
1523 }
1524 }
1525 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1526 _PyUnicode_GET_LENGTH(unicode)));
1527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528}
1529#endif
1530
Victor Stinnercd9950f2011-10-02 00:34:53 +02001531static int
Victor Stinner488fa492011-12-12 00:01:39 +01001532unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001533{
Victor Stinner488fa492011-12-12 00:01:39 +01001534 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001535 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001536 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001537 return -1;
1538 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001539 return 0;
1540}
1541
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001542static int
1543_copy_characters(PyObject *to, Py_ssize_t to_start,
1544 PyObject *from, Py_ssize_t from_start,
1545 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001546{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001547 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001548 const void *from_data;
1549 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550
Victor Stinneree4544c2012-05-09 22:24:08 +02001551 assert(0 <= how_many);
1552 assert(0 <= from_start);
1553 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001554 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001555 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001556 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557
Victor Stinnerd3f08822012-05-29 12:57:52 +02001558 assert(PyUnicode_Check(to));
1559 assert(PyUnicode_IS_READY(to));
1560 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1561
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001562 if (how_many == 0)
1563 return 0;
1564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001566 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001568 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569
Victor Stinnerf1852262012-06-16 16:38:26 +02001570#ifdef Py_DEBUG
1571 if (!check_maxchar
1572 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1573 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001574 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001575 Py_UCS4 ch;
1576 Py_ssize_t i;
1577 for (i=0; i < how_many; i++) {
1578 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1579 assert(ch <= to_maxchar);
1580 }
1581 }
1582#endif
1583
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001584 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001585 if (check_maxchar
1586 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1587 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001588 /* Writing Latin-1 characters into an ASCII string requires to
1589 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001590 Py_UCS4 max_char;
1591 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001592 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001593 if (max_char >= 128)
1594 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001595 }
Christian Heimesf051e432016-09-13 20:22:02 +02001596 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001597 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001598 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001600 else if (from_kind == PyUnicode_1BYTE_KIND
1601 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001602 {
1603 _PyUnicode_CONVERT_BYTES(
1604 Py_UCS1, Py_UCS2,
1605 PyUnicode_1BYTE_DATA(from) + from_start,
1606 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1607 PyUnicode_2BYTE_DATA(to) + to_start
1608 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001609 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001610 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001611 && to_kind == PyUnicode_4BYTE_KIND)
1612 {
1613 _PyUnicode_CONVERT_BYTES(
1614 Py_UCS1, Py_UCS4,
1615 PyUnicode_1BYTE_DATA(from) + from_start,
1616 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1617 PyUnicode_4BYTE_DATA(to) + to_start
1618 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001619 }
1620 else if (from_kind == PyUnicode_2BYTE_KIND
1621 && to_kind == PyUnicode_4BYTE_KIND)
1622 {
1623 _PyUnicode_CONVERT_BYTES(
1624 Py_UCS2, Py_UCS4,
1625 PyUnicode_2BYTE_DATA(from) + from_start,
1626 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1627 PyUnicode_4BYTE_DATA(to) + to_start
1628 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001629 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001630 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001631 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1632
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001633 if (!check_maxchar) {
1634 if (from_kind == PyUnicode_2BYTE_KIND
1635 && to_kind == PyUnicode_1BYTE_KIND)
1636 {
1637 _PyUnicode_CONVERT_BYTES(
1638 Py_UCS2, Py_UCS1,
1639 PyUnicode_2BYTE_DATA(from) + from_start,
1640 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1641 PyUnicode_1BYTE_DATA(to) + to_start
1642 );
1643 }
1644 else if (from_kind == PyUnicode_4BYTE_KIND
1645 && to_kind == PyUnicode_1BYTE_KIND)
1646 {
1647 _PyUnicode_CONVERT_BYTES(
1648 Py_UCS4, Py_UCS1,
1649 PyUnicode_4BYTE_DATA(from) + from_start,
1650 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1651 PyUnicode_1BYTE_DATA(to) + to_start
1652 );
1653 }
1654 else if (from_kind == PyUnicode_4BYTE_KIND
1655 && to_kind == PyUnicode_2BYTE_KIND)
1656 {
1657 _PyUnicode_CONVERT_BYTES(
1658 Py_UCS4, Py_UCS2,
1659 PyUnicode_4BYTE_DATA(from) + from_start,
1660 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1661 PyUnicode_2BYTE_DATA(to) + to_start
1662 );
1663 }
1664 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001665 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001666 }
1667 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001668 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001669 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001670 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001671 Py_ssize_t i;
1672
Victor Stinnera0702ab2011-09-29 14:14:38 +02001673 for (i=0; i < how_many; i++) {
1674 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001675 if (ch > to_maxchar)
1676 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001677 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1678 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001679 }
1680 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001681 return 0;
1682}
1683
Victor Stinnerd3f08822012-05-29 12:57:52 +02001684void
1685_PyUnicode_FastCopyCharacters(
1686 PyObject *to, Py_ssize_t to_start,
1687 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001688{
1689 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1690}
1691
1692Py_ssize_t
1693PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1694 PyObject *from, Py_ssize_t from_start,
1695 Py_ssize_t how_many)
1696{
1697 int err;
1698
1699 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1700 PyErr_BadInternalCall();
1701 return -1;
1702 }
1703
Benjamin Petersonbac79492012-01-14 13:34:47 -05001704 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001705 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001706 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001707 return -1;
1708
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001709 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001710 PyErr_SetString(PyExc_IndexError, "string index out of range");
1711 return -1;
1712 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001713 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001714 PyErr_SetString(PyExc_IndexError, "string index out of range");
1715 return -1;
1716 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001717 if (how_many < 0) {
1718 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1719 return -1;
1720 }
1721 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001722 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1723 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001724 "Cannot write %zi characters at %zi "
1725 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001726 how_many, to_start, PyUnicode_GET_LENGTH(to));
1727 return -1;
1728 }
1729
1730 if (how_many == 0)
1731 return 0;
1732
Victor Stinner488fa492011-12-12 00:01:39 +01001733 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001734 return -1;
1735
1736 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1737 if (err) {
1738 PyErr_Format(PyExc_SystemError,
1739 "Cannot copy %s characters "
1740 "into a string of %s characters",
1741 unicode_kind_name(from),
1742 unicode_kind_name(to));
1743 return -1;
1744 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001745 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746}
1747
Victor Stinner17222162011-09-28 22:15:37 +02001748/* Find the maximum code point and count the number of surrogate pairs so a
1749 correct string length can be computed before converting a string to UCS4.
1750 This function counts single surrogates as a character and not as a pair.
1751
1752 Return 0 on success, or -1 on error. */
1753static int
1754find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1755 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756{
1757 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001758 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759
Victor Stinnerc53be962011-10-02 21:33:54 +02001760 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 *num_surrogates = 0;
1762 *maxchar = 0;
1763
1764 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001766 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1767 && (iter+1) < end
1768 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1769 {
1770 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1771 ++(*num_surrogates);
1772 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 }
1774 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001776 {
1777 ch = *iter;
1778 iter++;
1779 }
1780 if (ch > *maxchar) {
1781 *maxchar = ch;
1782 if (*maxchar > MAX_UNICODE) {
1783 PyErr_Format(PyExc_ValueError,
1784 "character U+%x is not in range [U+0000; U+10ffff]",
1785 ch);
1786 return -1;
1787 }
1788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
1790 return 0;
1791}
1792
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001793int
1794_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795{
1796 wchar_t *end;
1797 Py_UCS4 maxchar = 0;
1798 Py_ssize_t num_surrogates;
1799#if SIZEOF_WCHAR_T == 2
1800 Py_ssize_t length_wo_surrogates;
1801#endif
1802
Georg Brandl7597add2011-10-05 16:36:47 +02001803 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001804 strings were created using _PyObject_New() and where no canonical
1805 representation (the str field) has been set yet aka strings
1806 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001807 assert(_PyUnicode_CHECK(unicode));
1808 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001810 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001811 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001812 /* Actually, it should neither be interned nor be anything else: */
1813 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001816 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001817 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819
1820 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001821 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1822 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 PyErr_NoMemory();
1824 return -1;
1825 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001826 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 _PyUnicode_WSTR(unicode), end,
1828 PyUnicode_1BYTE_DATA(unicode));
1829 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1830 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1831 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1832 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001833 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001834 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001835 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 }
1837 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001838 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001839 _PyUnicode_UTF8(unicode) = NULL;
1840 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 }
1842 PyObject_FREE(_PyUnicode_WSTR(unicode));
1843 _PyUnicode_WSTR(unicode) = NULL;
1844 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1845 }
1846 /* In this case we might have to convert down from 4-byte native
1847 wchar_t to 2-byte unicode. */
1848 else if (maxchar < 65536) {
1849 assert(num_surrogates == 0 &&
1850 "FindMaxCharAndNumSurrogatePairs() messed up");
1851
Victor Stinner506f5922011-09-28 22:34:18 +02001852#if SIZEOF_WCHAR_T == 2
1853 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001854 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001855 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1856 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1857 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001858 _PyUnicode_UTF8(unicode) = NULL;
1859 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001860#else
1861 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001862 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001863 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001864 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001865 PyErr_NoMemory();
1866 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 }
Victor Stinner506f5922011-09-28 22:34:18 +02001868 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1869 _PyUnicode_WSTR(unicode), end,
1870 PyUnicode_2BYTE_DATA(unicode));
1871 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1872 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1873 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 _PyUnicode_UTF8(unicode) = NULL;
1875 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001876 PyObject_FREE(_PyUnicode_WSTR(unicode));
1877 _PyUnicode_WSTR(unicode) = NULL;
1878 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1879#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880 }
1881 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1882 else {
1883#if SIZEOF_WCHAR_T == 2
1884 /* in case the native representation is 2-bytes, we need to allocate a
1885 new normalized 4-byte version. */
1886 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001887 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1888 PyErr_NoMemory();
1889 return -1;
1890 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001891 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1892 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 PyErr_NoMemory();
1894 return -1;
1895 }
1896 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1897 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001898 _PyUnicode_UTF8(unicode) = NULL;
1899 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001900 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1901 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001902 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 PyObject_FREE(_PyUnicode_WSTR(unicode));
1904 _PyUnicode_WSTR(unicode) = NULL;
1905 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1906#else
1907 assert(num_surrogates == 0);
1908
Victor Stinnerc3c74152011-10-02 20:39:55 +02001909 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001911 _PyUnicode_UTF8(unicode) = NULL;
1912 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1914#endif
1915 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1916 }
1917 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001918 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 return 0;
1920}
1921
Alexander Belopolsky40018472011-02-26 01:02:56 +00001922static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001923unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924{
Walter Dörwald16807132007-05-25 13:52:07 +00001925 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001926 case SSTATE_NOT_INTERNED:
1927 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001928
Benjamin Peterson29060642009-01-31 22:14:21 +00001929 case SSTATE_INTERNED_MORTAL:
1930 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001931 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001932#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001933 if (PyDict_DelItem(interned, unicode) != 0) {
1934 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1935 NULL);
1936 }
Victor Stinner607b1022020-05-05 18:50:30 +02001937#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001938 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001939
Benjamin Peterson29060642009-01-31 22:14:21 +00001940 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001941 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1942 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001943
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001945 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001946 }
1947
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001948 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001950 }
1951 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001952 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001953 }
1954 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001955 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001958 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959}
1960
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001961#ifdef Py_DEBUG
1962static int
1963unicode_is_singleton(PyObject *unicode)
1964{
Victor Stinner607b1022020-05-05 18:50:30 +02001965 if (unicode == unicode_empty) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001966 return 1;
Victor Stinner607b1022020-05-05 18:50:30 +02001967 }
1968#ifdef LATIN1_SINGLETONS
1969 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001970 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1971 {
1972 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1973 if (ch < 256 && unicode_latin1[ch] == unicode)
1974 return 1;
1975 }
Victor Stinner607b1022020-05-05 18:50:30 +02001976#endif
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001977 return 0;
1978}
1979#endif
1980
Alexander Belopolsky40018472011-02-26 01:02:56 +00001981static int
Victor Stinner488fa492011-12-12 00:01:39 +01001982unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001983{
Victor Stinner488fa492011-12-12 00:01:39 +01001984 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001985 if (Py_REFCNT(unicode) != 1)
1986 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001987 if (_PyUnicode_HASH(unicode) != -1)
1988 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001989 if (PyUnicode_CHECK_INTERNED(unicode))
1990 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001991 if (!PyUnicode_CheckExact(unicode))
1992 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001993#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001994 /* singleton refcount is greater than 1 */
1995 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001996#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001997 return 1;
1998}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999
Victor Stinnerfe226c02011-10-03 03:52:20 +02002000static int
2001unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2002{
2003 PyObject *unicode;
2004 Py_ssize_t old_length;
2005
2006 assert(p_unicode != NULL);
2007 unicode = *p_unicode;
2008
2009 assert(unicode != NULL);
2010 assert(PyUnicode_Check(unicode));
2011 assert(0 <= length);
2012
Victor Stinner910337b2011-10-03 03:20:16 +02002013 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002014 old_length = PyUnicode_WSTR_LENGTH(unicode);
2015 else
2016 old_length = PyUnicode_GET_LENGTH(unicode);
2017 if (old_length == length)
2018 return 0;
2019
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002020 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002021 _Py_INCREF_UNICODE_EMPTY();
2022 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002023 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002024 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002025 return 0;
2026 }
2027
Victor Stinner488fa492011-12-12 00:01:39 +01002028 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002029 PyObject *copy = resize_copy(unicode, length);
2030 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002032 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002033 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 }
2035
Victor Stinnerfe226c02011-10-03 03:52:20 +02002036 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002037 PyObject *new_unicode = resize_compact(unicode, length);
2038 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002039 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002040 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002041 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002042 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002043 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002044}
2045
Alexander Belopolsky40018472011-02-26 01:02:56 +00002046int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002047PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002048{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002049 PyObject *unicode;
2050 if (p_unicode == NULL) {
2051 PyErr_BadInternalCall();
2052 return -1;
2053 }
2054 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002055 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002056 {
2057 PyErr_BadInternalCall();
2058 return -1;
2059 }
2060 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002061}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002062
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002063/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002064
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002065 WARNING: The function doesn't copy the terminating null character and
2066 doesn't check the maximum character (may write a latin1 character in an
2067 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002068static void
2069unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2070 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002071{
2072 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002073 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002074 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002075
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002076 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002077 switch (kind) {
2078 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002079#ifdef Py_DEBUG
2080 if (PyUnicode_IS_ASCII(unicode)) {
2081 Py_UCS4 maxchar = ucs1lib_find_max_char(
2082 (const Py_UCS1*)str,
2083 (const Py_UCS1*)str + len);
2084 assert(maxchar < 128);
2085 }
2086#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002087 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002088 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002089 }
2090 case PyUnicode_2BYTE_KIND: {
2091 Py_UCS2 *start = (Py_UCS2 *)data + index;
2092 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002093
Victor Stinner184252a2012-06-16 02:57:41 +02002094 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002095 *ucs2 = (Py_UCS2)*str;
2096
2097 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002098 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002099 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002100 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002101 Py_UCS4 *start = (Py_UCS4 *)data + index;
2102 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103
Victor Stinner184252a2012-06-16 02:57:41 +02002104 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002105 *ucs4 = (Py_UCS4)*str;
2106
2107 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002108 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002110 default:
2111 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002112 }
2113}
2114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115static PyObject*
2116get_latin1_char(unsigned char ch)
2117{
Victor Stinner607b1022020-05-05 18:50:30 +02002118 PyObject *unicode;
2119
2120#ifdef LATIN1_SINGLETONS
2121 unicode = unicode_latin1[ch];
2122 if (unicode) {
2123 Py_INCREF(unicode);
2124 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 }
Victor Stinner607b1022020-05-05 18:50:30 +02002126#endif
2127
2128 unicode = PyUnicode_New(1, ch);
2129 if (!unicode) {
2130 return NULL;
2131 }
2132
2133 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2134 assert(_PyUnicode_CheckConsistency(unicode, 1));
2135
2136#ifdef LATIN1_SINGLETONS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137 Py_INCREF(unicode);
Victor Stinner607b1022020-05-05 18:50:30 +02002138 unicode_latin1[ch] = unicode;
2139#endif
Victor Stinnera464fc12011-10-02 20:39:30 +02002140 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002141}
2142
Victor Stinner985a82a2014-01-03 12:53:47 +01002143static PyObject*
2144unicode_char(Py_UCS4 ch)
2145{
2146 PyObject *unicode;
2147
2148 assert(ch <= MAX_UNICODE);
2149
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002150 if (ch < 256)
2151 return get_latin1_char(ch);
2152
Victor Stinner985a82a2014-01-03 12:53:47 +01002153 unicode = PyUnicode_New(1, ch);
2154 if (unicode == NULL)
2155 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002156
2157 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2158 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002159 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002160 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002161 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2162 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2163 }
2164 assert(_PyUnicode_CheckConsistency(unicode, 1));
2165 return unicode;
2166}
2167
Alexander Belopolsky40018472011-02-26 01:02:56 +00002168PyObject *
2169PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002171 if (u == NULL)
2172 return (PyObject*)_PyUnicode_New(size);
2173
2174 if (size < 0) {
2175 PyErr_BadInternalCall();
2176 return NULL;
2177 }
2178
2179 return PyUnicode_FromWideChar(u, size);
2180}
2181
2182PyObject *
2183PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2184{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002185 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 Py_UCS4 maxchar = 0;
2187 Py_ssize_t num_surrogates;
2188
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002189 if (u == NULL && size != 0) {
2190 PyErr_BadInternalCall();
2191 return NULL;
2192 }
2193
2194 if (size == -1) {
2195 size = wcslen(u);
2196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002198 /* If the Unicode data is known at construction time, we can apply
2199 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002202 if (size == 0)
2203 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 /* Single character Unicode objects in the Latin-1 range are
2206 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002207 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 return get_latin1_char((unsigned char)*u);
2209
2210 /* If not empty and not single character, copy the Unicode data
2211 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002212 if (find_maxchar_surrogates(u, u + size,
2213 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 return NULL;
2215
Victor Stinner8faf8212011-12-08 22:14:11 +01002216 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 if (!unicode)
2218 return NULL;
2219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 switch (PyUnicode_KIND(unicode)) {
2221 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002222 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2224 break;
2225 case PyUnicode_2BYTE_KIND:
2226#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002227 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002229 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2231#endif
2232 break;
2233 case PyUnicode_4BYTE_KIND:
2234#if SIZEOF_WCHAR_T == 2
2235 /* This is the only case which has to process surrogates, thus
2236 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002237 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238#else
2239 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002240 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241#endif
2242 break;
2243 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002244 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002247 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248}
2249
Alexander Belopolsky40018472011-02-26 01:02:56 +00002250PyObject *
2251PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002252{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002253 if (size < 0) {
2254 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002255 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002256 return NULL;
2257 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002258 if (u != NULL)
2259 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2260 else
2261 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002262}
2263
Alexander Belopolsky40018472011-02-26 01:02:56 +00002264PyObject *
2265PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002266{
2267 size_t size = strlen(u);
2268 if (size > PY_SSIZE_T_MAX) {
2269 PyErr_SetString(PyExc_OverflowError, "input too long");
2270 return NULL;
2271 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002272 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002273}
2274
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002275PyObject *
2276_PyUnicode_FromId(_Py_Identifier *id)
2277{
2278 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002279 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2280 strlen(id->string),
2281 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002282 if (!id->object)
2283 return NULL;
2284 PyUnicode_InternInPlace(&id->object);
2285 assert(!id->next);
2286 id->next = static_strings;
2287 static_strings = id;
2288 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002289 return id->object;
2290}
2291
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002292static void
2293unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002294{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002295 _Py_Identifier *tmp, *s = static_strings;
2296 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002297 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002298 tmp = s->next;
2299 s->next = NULL;
2300 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002301 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002302 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002303}
2304
Benjamin Peterson0df54292012-03-26 14:50:32 -04002305/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002306
Victor Stinnerd3f08822012-05-29 12:57:52 +02002307PyObject*
2308_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002309{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002310 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002311 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002312 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002313#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002314 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002315#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002316 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002317 }
Victor Stinner785938e2011-12-11 20:09:03 +01002318 unicode = PyUnicode_New(size, 127);
2319 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002320 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002321 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2322 assert(_PyUnicode_CheckConsistency(unicode, 1));
2323 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002324}
2325
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002326static Py_UCS4
2327kind_maxchar_limit(unsigned int kind)
2328{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002329 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002330 case PyUnicode_1BYTE_KIND:
2331 return 0x80;
2332 case PyUnicode_2BYTE_KIND:
2333 return 0x100;
2334 case PyUnicode_4BYTE_KIND:
2335 return 0x10000;
2336 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002337 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002338 }
2339}
2340
Victor Stinner702c7342011-10-05 13:50:52 +02002341static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002342_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002343{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002344 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002345 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002346
Serhiy Storchaka678db842013-01-26 12:16:36 +02002347 if (size == 0)
2348 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002349 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002350 if (size == 1)
2351 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002352
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002353 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002354 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002355 if (!res)
2356 return NULL;
2357 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002358 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002360}
2361
Victor Stinnere57b1c02011-09-28 22:20:48 +02002362static PyObject*
2363_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364{
2365 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002366 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002367
Serhiy Storchaka678db842013-01-26 12:16:36 +02002368 if (size == 0)
2369 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002370 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002371 if (size == 1)
2372 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002373
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002374 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002375 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 if (!res)
2377 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002378 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002380 else {
2381 _PyUnicode_CONVERT_BYTES(
2382 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2383 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002384 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 return res;
2386}
2387
Victor Stinnere57b1c02011-09-28 22:20:48 +02002388static PyObject*
2389_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390{
2391 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002392 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002393
Serhiy Storchaka678db842013-01-26 12:16:36 +02002394 if (size == 0)
2395 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002396 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002397 if (size == 1)
2398 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002399
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002400 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002401 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 if (!res)
2403 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002404 if (max_char < 256)
2405 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2406 PyUnicode_1BYTE_DATA(res));
2407 else if (max_char < 0x10000)
2408 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2409 PyUnicode_2BYTE_DATA(res));
2410 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002412 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return res;
2414}
2415
2416PyObject*
2417PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2418{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002419 if (size < 0) {
2420 PyErr_SetString(PyExc_ValueError, "size must be positive");
2421 return NULL;
2422 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002423 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002425 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002427 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002429 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002430 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002431 PyErr_SetString(PyExc_SystemError, "invalid kind");
2432 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434}
2435
Victor Stinnerece58de2012-04-23 23:36:38 +02002436Py_UCS4
2437_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2438{
2439 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002440 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002441
2442 assert(PyUnicode_IS_READY(unicode));
2443 assert(0 <= start);
2444 assert(end <= PyUnicode_GET_LENGTH(unicode));
2445 assert(start <= end);
2446
2447 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2448 return PyUnicode_MAX_CHAR_VALUE(unicode);
2449
2450 if (start == end)
2451 return 127;
2452
Victor Stinner94d558b2012-04-27 22:26:58 +02002453 if (PyUnicode_IS_ASCII(unicode))
2454 return 127;
2455
Victor Stinnerece58de2012-04-23 23:36:38 +02002456 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002457 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002458 endptr = (char *)startptr + end * kind;
2459 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002460 switch(kind) {
2461 case PyUnicode_1BYTE_KIND:
2462 return ucs1lib_find_max_char(startptr, endptr);
2463 case PyUnicode_2BYTE_KIND:
2464 return ucs2lib_find_max_char(startptr, endptr);
2465 case PyUnicode_4BYTE_KIND:
2466 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002467 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002468 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002469 }
2470}
2471
Victor Stinner25a4b292011-10-06 12:31:55 +02002472/* Ensure that a string uses the most efficient storage, if it is not the
2473 case: create a new string with of the right kind. Write NULL into *p_unicode
2474 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002475static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002476unicode_adjust_maxchar(PyObject **p_unicode)
2477{
2478 PyObject *unicode, *copy;
2479 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002480 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002481 unsigned int kind;
2482
2483 assert(p_unicode != NULL);
2484 unicode = *p_unicode;
2485 assert(PyUnicode_IS_READY(unicode));
2486 if (PyUnicode_IS_ASCII(unicode))
2487 return;
2488
2489 len = PyUnicode_GET_LENGTH(unicode);
2490 kind = PyUnicode_KIND(unicode);
2491 if (kind == PyUnicode_1BYTE_KIND) {
2492 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002493 max_char = ucs1lib_find_max_char(u, u + len);
2494 if (max_char >= 128)
2495 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002496 }
2497 else if (kind == PyUnicode_2BYTE_KIND) {
2498 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002499 max_char = ucs2lib_find_max_char(u, u + len);
2500 if (max_char >= 256)
2501 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002502 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002503 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002504 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002505 max_char = ucs4lib_find_max_char(u, u + len);
2506 if (max_char >= 0x10000)
2507 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002508 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002509 else
2510 Py_UNREACHABLE();
2511
Victor Stinner25a4b292011-10-06 12:31:55 +02002512 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002513 if (copy != NULL)
2514 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002515 Py_DECREF(unicode);
2516 *p_unicode = copy;
2517}
2518
Victor Stinner034f6cf2011-09-30 02:26:44 +02002519PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002520_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002521{
Victor Stinner87af4f22011-11-21 23:03:47 +01002522 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002523 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002524
Victor Stinner034f6cf2011-09-30 02:26:44 +02002525 if (!PyUnicode_Check(unicode)) {
2526 PyErr_BadInternalCall();
2527 return NULL;
2528 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002529 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002530 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002531
Victor Stinner87af4f22011-11-21 23:03:47 +01002532 length = PyUnicode_GET_LENGTH(unicode);
2533 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002534 if (!copy)
2535 return NULL;
2536 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2537
Christian Heimesf051e432016-09-13 20:22:02 +02002538 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002539 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002540 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002541 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002542}
2543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002544
Victor Stinnerbc603d12011-10-02 01:00:40 +02002545/* Widen Unicode objects to larger buffers. Don't write terminating null
2546 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002548static void*
2549unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002550{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002551 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002552
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002553 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002554 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002555 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002556 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002557 if (!result)
2558 return PyErr_NoMemory();
2559 assert(skind == PyUnicode_1BYTE_KIND);
2560 _PyUnicode_CONVERT_BYTES(
2561 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002562 (const Py_UCS1 *)data,
2563 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002564 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002566 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002567 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002568 if (!result)
2569 return PyErr_NoMemory();
2570 if (skind == PyUnicode_2BYTE_KIND) {
2571 _PyUnicode_CONVERT_BYTES(
2572 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002573 (const Py_UCS2 *)data,
2574 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002575 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002577 else {
2578 assert(skind == PyUnicode_1BYTE_KIND);
2579 _PyUnicode_CONVERT_BYTES(
2580 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002581 (const Py_UCS1 *)data,
2582 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002583 result);
2584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002585 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002586 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002587 Py_UNREACHABLE();
2588 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590}
2591
2592static Py_UCS4*
2593as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2594 int copy_null)
2595{
2596 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002597 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 Py_ssize_t len, targetlen;
2599 if (PyUnicode_READY(string) == -1)
2600 return NULL;
2601 kind = PyUnicode_KIND(string);
2602 data = PyUnicode_DATA(string);
2603 len = PyUnicode_GET_LENGTH(string);
2604 targetlen = len;
2605 if (copy_null)
2606 targetlen++;
2607 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002608 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 if (!target) {
2610 PyErr_NoMemory();
2611 return NULL;
2612 }
2613 }
2614 else {
2615 if (targetsize < targetlen) {
2616 PyErr_Format(PyExc_SystemError,
2617 "string is longer than the buffer");
2618 if (copy_null && 0 < targetsize)
2619 target[0] = 0;
2620 return NULL;
2621 }
2622 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002623 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002624 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002625 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002627 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002628 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002629 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2630 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002631 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002632 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002633 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002634 else {
2635 Py_UNREACHABLE();
2636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 if (copy_null)
2638 target[len] = 0;
2639 return target;
2640}
2641
2642Py_UCS4*
2643PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2644 int copy_null)
2645{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002646 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002647 PyErr_BadInternalCall();
2648 return NULL;
2649 }
2650 return as_ucs4(string, target, targetsize, copy_null);
2651}
2652
2653Py_UCS4*
2654PyUnicode_AsUCS4Copy(PyObject *string)
2655{
2656 return as_ucs4(string, NULL, 0, 1);
2657}
2658
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for
   the sign.  53/22 is an upper bound for log10(256), so
   1 + (SIZEOF_LONG_LONG*53-1) / 22 is an upper bound for that ceiling,
   computed with integer arithmetic. */
2662#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002663
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002664static int
2665unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2666 Py_ssize_t width, Py_ssize_t precision)
2667{
2668 Py_ssize_t length, fill, arglen;
2669 Py_UCS4 maxchar;
2670
2671 if (PyUnicode_READY(str) == -1)
2672 return -1;
2673
2674 length = PyUnicode_GET_LENGTH(str);
2675 if ((precision == -1 || precision >= length)
2676 && width <= length)
2677 return _PyUnicodeWriter_WriteStr(writer, str);
2678
2679 if (precision != -1)
2680 length = Py_MIN(precision, length);
2681
2682 arglen = Py_MAX(length, width);
2683 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2684 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2685 else
2686 maxchar = writer->maxchar;
2687
2688 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2689 return -1;
2690
2691 if (width > length) {
2692 fill = width - length;
2693 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2694 return -1;
2695 writer->pos += fill;
2696 }
2697
2698 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2699 str, 0, length);
2700 writer->pos += length;
2701 return 0;
2702}
2703
2704static int
Victor Stinner998b8062018-09-12 00:23:25 +02002705unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002706 Py_ssize_t width, Py_ssize_t precision)
2707{
2708 /* UTF-8 */
2709 Py_ssize_t length;
2710 PyObject *unicode;
2711 int res;
2712
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002713 if (precision == -1) {
2714 length = strlen(str);
2715 }
2716 else {
2717 length = 0;
2718 while (length < precision && str[length]) {
2719 length++;
2720 }
2721 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002722 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2723 if (unicode == NULL)
2724 return -1;
2725
2726 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2727 Py_DECREF(unicode);
2728 return res;
2729}
2730
Victor Stinner96865452011-03-01 23:44:09 +00002731static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002732unicode_fromformat_arg(_PyUnicodeWriter *writer,
2733 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002734{
Victor Stinnere215d962012-10-06 23:03:36 +02002735 const char *p;
2736 Py_ssize_t len;
2737 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002738 Py_ssize_t width;
2739 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 int longflag;
2741 int longlongflag;
2742 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002743 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002744
2745 p = f;
2746 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002747 zeropad = 0;
2748 if (*f == '0') {
2749 zeropad = 1;
2750 f++;
2751 }
Victor Stinner96865452011-03-01 23:44:09 +00002752
2753 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002754 width = -1;
2755 if (Py_ISDIGIT((unsigned)*f)) {
2756 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002757 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002758 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002759 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002760 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002761 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002762 return NULL;
2763 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002764 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002765 f++;
2766 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002767 }
2768 precision = -1;
2769 if (*f == '.') {
2770 f++;
2771 if (Py_ISDIGIT((unsigned)*f)) {
2772 precision = (*f - '0');
2773 f++;
2774 while (Py_ISDIGIT((unsigned)*f)) {
2775 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2776 PyErr_SetString(PyExc_ValueError,
2777 "precision too big");
2778 return NULL;
2779 }
2780 precision = (precision * 10) + (*f - '0');
2781 f++;
2782 }
2783 }
Victor Stinner96865452011-03-01 23:44:09 +00002784 if (*f == '%') {
2785 /* "%.3%s" => f points to "3" */
2786 f--;
2787 }
2788 }
2789 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002790 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002791 f--;
2792 }
Victor Stinner96865452011-03-01 23:44:09 +00002793
2794 /* Handle %ld, %lu, %lld and %llu. */
2795 longflag = 0;
2796 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002797 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002798 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002799 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002800 longflag = 1;
2801 ++f;
2802 }
Victor Stinner96865452011-03-01 23:44:09 +00002803 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002804 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002805 longlongflag = 1;
2806 f += 2;
2807 }
Victor Stinner96865452011-03-01 23:44:09 +00002808 }
2809 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002811 size_tflag = 1;
2812 ++f;
2813 }
Victor Stinnere215d962012-10-06 23:03:36 +02002814
2815 if (f[1] == '\0')
2816 writer->overallocate = 0;
2817
2818 switch (*f) {
2819 case 'c':
2820 {
2821 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002822 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002823 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002824 "character argument not in range(0x110000)");
2825 return NULL;
2826 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002827 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002828 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002829 break;
2830 }
2831
2832 case 'i':
2833 case 'd':
2834 case 'u':
2835 case 'x':
2836 {
2837 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002838 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002839 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002840
2841 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002842 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002843 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002844 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002845 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002846 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002847 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002848 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002849 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002850 va_arg(*vargs, size_t));
2851 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002852 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002853 va_arg(*vargs, unsigned int));
2854 }
2855 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002856 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002857 }
2858 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002859 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002860 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002861 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002862 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002863 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002864 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002865 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002866 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002867 va_arg(*vargs, Py_ssize_t));
2868 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002869 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002870 va_arg(*vargs, int));
2871 }
2872 assert(len >= 0);
2873
Victor Stinnere215d962012-10-06 23:03:36 +02002874 if (precision < len)
2875 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002876
2877 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002878 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2879 return NULL;
2880
Victor Stinnere215d962012-10-06 23:03:36 +02002881 if (width > precision) {
2882 Py_UCS4 fillchar;
2883 fill = width - precision;
2884 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002885 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2886 return NULL;
2887 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002888 }
Victor Stinner15a11362012-10-06 23:48:20 +02002889 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002890 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002891 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2892 return NULL;
2893 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002894 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002895
Victor Stinner4a587072013-11-19 12:54:53 +01002896 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2897 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002898 break;
2899 }
2900
2901 case 'p':
2902 {
2903 char number[MAX_LONG_LONG_CHARS];
2904
2905 len = sprintf(number, "%p", va_arg(*vargs, void*));
2906 assert(len >= 0);
2907
2908 /* %p is ill-defined: ensure leading 0x. */
2909 if (number[1] == 'X')
2910 number[1] = 'x';
2911 else if (number[1] != 'x') {
2912 memmove(number + 2, number,
2913 strlen(number) + 1);
2914 number[0] = '0';
2915 number[1] = 'x';
2916 len += 2;
2917 }
2918
Victor Stinner4a587072013-11-19 12:54:53 +01002919 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return NULL;
2921 break;
2922 }
2923
2924 case 's':
2925 {
2926 /* UTF-8 */
2927 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002928 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002929 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002930 break;
2931 }
2932
2933 case 'U':
2934 {
2935 PyObject *obj = va_arg(*vargs, PyObject *);
2936 assert(obj && _PyUnicode_CHECK(obj));
2937
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002938 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002939 return NULL;
2940 break;
2941 }
2942
2943 case 'V':
2944 {
2945 PyObject *obj = va_arg(*vargs, PyObject *);
2946 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002947 if (obj) {
2948 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002949 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002950 return NULL;
2951 }
2952 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002953 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002954 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002955 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002956 }
2957 break;
2958 }
2959
2960 case 'S':
2961 {
2962 PyObject *obj = va_arg(*vargs, PyObject *);
2963 PyObject *str;
2964 assert(obj);
2965 str = PyObject_Str(obj);
2966 if (!str)
2967 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002968 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002969 Py_DECREF(str);
2970 return NULL;
2971 }
2972 Py_DECREF(str);
2973 break;
2974 }
2975
2976 case 'R':
2977 {
2978 PyObject *obj = va_arg(*vargs, PyObject *);
2979 PyObject *repr;
2980 assert(obj);
2981 repr = PyObject_Repr(obj);
2982 if (!repr)
2983 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002984 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002985 Py_DECREF(repr);
2986 return NULL;
2987 }
2988 Py_DECREF(repr);
2989 break;
2990 }
2991
2992 case 'A':
2993 {
2994 PyObject *obj = va_arg(*vargs, PyObject *);
2995 PyObject *ascii;
2996 assert(obj);
2997 ascii = PyObject_ASCII(obj);
2998 if (!ascii)
2999 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003000 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003001 Py_DECREF(ascii);
3002 return NULL;
3003 }
3004 Py_DECREF(ascii);
3005 break;
3006 }
3007
3008 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003009 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003010 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003011 break;
3012
3013 default:
3014 /* if we stumble upon an unknown formatting code, copy the rest
3015 of the format string to the output string. (we cannot just
3016 skip the code, since there's no way to know what's in the
3017 argument list) */
3018 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003019 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003020 return NULL;
3021 f = p+len;
3022 return f;
3023 }
3024
3025 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003026 return f;
3027}
3028
Walter Dörwaldd2034312007-05-18 16:29:38 +00003029PyObject *
3030PyUnicode_FromFormatV(const char *format, va_list vargs)
3031{
Victor Stinnere215d962012-10-06 23:03:36 +02003032 va_list vargs2;
3033 const char *f;
3034 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003035
Victor Stinner8f674cc2013-04-17 23:02:17 +02003036 _PyUnicodeWriter_Init(&writer);
3037 writer.min_length = strlen(format) + 100;
3038 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003039
Benjamin Peterson0c212142016-09-20 20:39:33 -07003040 // Copy varags to be able to pass a reference to a subfunction.
3041 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003042
3043 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003044 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003045 f = unicode_fromformat_arg(&writer, f, &vargs2);
3046 if (f == NULL)
3047 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003049 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003050 const char *p;
3051 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003052
Victor Stinnere215d962012-10-06 23:03:36 +02003053 p = f;
3054 do
3055 {
3056 if ((unsigned char)*p > 127) {
3057 PyErr_Format(PyExc_ValueError,
3058 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3059 "string, got a non-ASCII byte: 0x%02x",
3060 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003061 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003062 }
3063 p++;
3064 }
3065 while (*p != '\0' && *p != '%');
3066 len = p - f;
3067
3068 if (*p == '\0')
3069 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003070
3071 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003072 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003073
3074 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003075 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003076 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003077 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003078 return _PyUnicodeWriter_Finish(&writer);
3079
3080 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003081 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003082 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003083 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003084}
3085
Walter Dörwaldd2034312007-05-18 16:29:38 +00003086PyObject *
3087PyUnicode_FromFormat(const char *format, ...)
3088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003089 PyObject* ret;
3090 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003091
3092#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003093 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003094#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003095 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003096#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003097 ret = PyUnicode_FromFormatV(format, vargs);
3098 va_end(vargs);
3099 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003100}
3101
Serhiy Storchakac46db922018-10-23 22:58:24 +03003102static Py_ssize_t
3103unicode_get_widechar_size(PyObject *unicode)
3104{
3105 Py_ssize_t res;
3106
3107 assert(unicode != NULL);
3108 assert(_PyUnicode_CHECK(unicode));
3109
3110 if (_PyUnicode_WSTR(unicode) != NULL) {
3111 return PyUnicode_WSTR_LENGTH(unicode);
3112 }
3113 assert(PyUnicode_IS_READY(unicode));
3114
3115 res = _PyUnicode_LENGTH(unicode);
3116#if SIZEOF_WCHAR_T == 2
3117 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3118 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3119 const Py_UCS4 *end = s + res;
3120 for (; s < end; ++s) {
3121 if (*s > 0xFFFF) {
3122 ++res;
3123 }
3124 }
3125 }
3126#endif
3127 return res;
3128}
3129
3130static void
3131unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3132{
3133 const wchar_t *wstr;
3134
3135 assert(unicode != NULL);
3136 assert(_PyUnicode_CHECK(unicode));
3137
3138 wstr = _PyUnicode_WSTR(unicode);
3139 if (wstr != NULL) {
3140 memcpy(w, wstr, size * sizeof(wchar_t));
3141 return;
3142 }
3143 assert(PyUnicode_IS_READY(unicode));
3144
3145 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3146 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3147 for (; size--; ++s, ++w) {
3148 *w = *s;
3149 }
3150 }
3151 else {
3152#if SIZEOF_WCHAR_T == 4
3153 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3154 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3155 for (; size--; ++s, ++w) {
3156 *w = *s;
3157 }
3158#else
3159 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3160 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3161 for (; size--; ++s, ++w) {
3162 Py_UCS4 ch = *s;
3163 if (ch > 0xFFFF) {
3164 assert(ch <= MAX_UNICODE);
3165 /* encode surrogate pair in this case */
3166 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3167 if (!size--)
3168 break;
3169 *w = Py_UNICODE_LOW_SURROGATE(ch);
3170 }
3171 else {
3172 *w = ch;
3173 }
3174 }
3175#endif
3176 }
3177}
3178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003179#ifdef HAVE_WCHAR_H
3180
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003181/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003182
Victor Stinnerd88d9832011-09-06 02:00:05 +02003183 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003184 character) required to convert the unicode object. Ignore size argument.
3185
Victor Stinnerd88d9832011-09-06 02:00:05 +02003186 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003187 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003188 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003189Py_ssize_t
3190PyUnicode_AsWideChar(PyObject *unicode,
3191 wchar_t *w,
3192 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003193{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003194 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003195
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003196 if (unicode == NULL) {
3197 PyErr_BadInternalCall();
3198 return -1;
3199 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003200 if (!PyUnicode_Check(unicode)) {
3201 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003202 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003203 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003204
3205 res = unicode_get_widechar_size(unicode);
3206 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003207 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003208 }
3209
3210 if (size > res) {
3211 size = res + 1;
3212 }
3213 else {
3214 res = size;
3215 }
3216 unicode_copy_as_widechar(unicode, w, size);
3217 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003218}
3219
Victor Stinner137c34c2010-09-29 10:25:54 +00003220wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003221PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003222 Py_ssize_t *size)
3223{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003224 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003225 Py_ssize_t buflen;
3226
3227 if (unicode == NULL) {
3228 PyErr_BadInternalCall();
3229 return NULL;
3230 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003231 if (!PyUnicode_Check(unicode)) {
3232 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003233 return NULL;
3234 }
3235
Serhiy Storchakac46db922018-10-23 22:58:24 +03003236 buflen = unicode_get_widechar_size(unicode);
3237 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003238 if (buffer == NULL) {
3239 PyErr_NoMemory();
3240 return NULL;
3241 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003242 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3243 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003244 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003245 }
3246 else if (wcslen(buffer) != (size_t)buflen) {
3247 PyMem_FREE(buffer);
3248 PyErr_SetString(PyExc_ValueError,
3249 "embedded null character");
3250 return NULL;
3251 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003252 return buffer;
3253}
3254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003255#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256
Alexander Belopolsky40018472011-02-26 01:02:56 +00003257PyObject *
3258PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003259{
Victor Stinner8faf8212011-12-08 22:14:11 +01003260 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 PyErr_SetString(PyExc_ValueError,
3262 "chr() arg not in range(0x110000)");
3263 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003264 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003265
Victor Stinner985a82a2014-01-03 12:53:47 +01003266 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003267}
3268
Alexander Belopolsky40018472011-02-26 01:02:56 +00003269PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003270PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003272 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003273 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003274 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003275 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003276 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003277 Py_INCREF(obj);
3278 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003279 }
3280 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 /* For a Unicode subtype that's not a Unicode object,
3282 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003283 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003284 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003285 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003286 "Can't convert '%.100s' object to str implicitly",
3287 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003288 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003289}
3290
Alexander Belopolsky40018472011-02-26 01:02:56 +00003291PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003292PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003293 const char *encoding,
3294 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003295{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003296 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003297 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003298
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003300 PyErr_BadInternalCall();
3301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003303
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003304 /* Decoding bytes objects is the most common case and should be fast */
3305 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003306 if (PyBytes_GET_SIZE(obj) == 0) {
3307 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3308 return NULL;
3309 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003310 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003311 }
3312 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003313 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3314 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003315 }
3316
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003317 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003318 PyErr_SetString(PyExc_TypeError,
3319 "decoding str is not supported");
3320 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003321 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003322
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003323 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3324 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3325 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003326 "decoding to str: need a bytes-like object, %.80s found",
3327 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003328 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003329 }
Tim Petersced69f82003-09-16 20:30:58 +00003330
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003331 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003332 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003333 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3334 return NULL;
3335 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003336 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003338
Serhiy Storchaka05997252013-01-26 12:14:02 +02003339 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003340 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003341 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342}
3343
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Any
   run of other characters is replaced by a single '_', except at the very
   beginning or end of the name, where it is dropped.  The result is
   null-terminated.

   `lower_len` is the size of the `lower` buffer, including room for the
   terminating null byte.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* No room even for the terminating null byte; bail out before
           computing &lower[lower_len - 1], which would be out of bounds. */
        return 0;
    }

    e = encoding;
    l = lower;
    /* last usable position: reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* A pending run of punctuation becomes one '_', unless nothing
               has been written yet. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3392
Alexander Belopolsky40018472011-02-26 01:02:56 +00003393PyObject *
3394PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003395 Py_ssize_t size,
3396 const char *encoding,
3397 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003398{
3399 PyObject *buffer = NULL, *unicode;
3400 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003401 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3402
Victor Stinner22eb6892019-06-26 00:51:05 +02003403 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3404 return NULL;
3405 }
3406
Victor Stinnered076ed2019-06-26 01:49:32 +02003407 if (size == 0) {
3408 _Py_RETURN_UNICODE_EMPTY();
3409 }
3410
Victor Stinner942889a2016-09-05 15:40:10 -07003411 if (encoding == NULL) {
3412 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3413 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003414
Fred Drakee4315f52000-05-09 19:53:39 +00003415 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003416 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3417 char *lower = buflower;
3418
3419 /* Fast paths */
3420 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3421 lower += 3;
3422 if (*lower == '_') {
3423 /* Match "utf8" and "utf_8" */
3424 lower++;
3425 }
3426
3427 if (lower[0] == '8' && lower[1] == 0) {
3428 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3429 }
3430 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3431 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3432 }
3433 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3434 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3435 }
3436 }
3437 else {
3438 if (strcmp(lower, "ascii") == 0
3439 || strcmp(lower, "us_ascii") == 0) {
3440 return PyUnicode_DecodeASCII(s, size, errors);
3441 }
Steve Dowercc16be82016-09-08 10:35:16 -07003442 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003443 else if (strcmp(lower, "mbcs") == 0) {
3444 return PyUnicode_DecodeMBCS(s, size, errors);
3445 }
3446 #endif
3447 else if (strcmp(lower, "latin1") == 0
3448 || strcmp(lower, "latin_1") == 0
3449 || strcmp(lower, "iso_8859_1") == 0
3450 || strcmp(lower, "iso8859_1") == 0) {
3451 return PyUnicode_DecodeLatin1(s, size, errors);
3452 }
3453 }
Victor Stinner37296e82010-06-10 13:36:23 +00003454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455
3456 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003457 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003458 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003459 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003460 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 if (buffer == NULL)
3462 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003463 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 if (unicode == NULL)
3465 goto onError;
3466 if (!PyUnicode_Check(unicode)) {
3467 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003468 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003469 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003470 encoding,
3471 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 Py_DECREF(unicode);
3473 goto onError;
3474 }
3475 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003476 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003477
Benjamin Peterson29060642009-01-31 22:14:21 +00003478 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 Py_XDECREF(buffer);
3480 return NULL;
3481}
3482
Alexander Belopolsky40018472011-02-26 01:02:56 +00003483PyObject *
3484PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003485 const char *encoding,
3486 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003487{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003488 if (!PyUnicode_Check(unicode)) {
3489 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003490 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003491 }
3492
Serhiy Storchaka00939072016-10-27 21:05:49 +03003493 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3494 "PyUnicode_AsDecodedObject() is deprecated; "
3495 "use PyCodec_Decode() to decode from str", 1) < 0)
3496 return NULL;
3497
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003498 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003499 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003500
3501 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003502 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003503}
3504
Alexander Belopolsky40018472011-02-26 01:02:56 +00003505PyObject *
3506PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003507 const char *encoding,
3508 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003509{
3510 PyObject *v;
3511
3512 if (!PyUnicode_Check(unicode)) {
3513 PyErr_BadArgument();
3514 goto onError;
3515 }
3516
Serhiy Storchaka00939072016-10-27 21:05:49 +03003517 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3518 "PyUnicode_AsDecodedUnicode() is deprecated; "
3519 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3520 return NULL;
3521
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003522 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003524
3525 /* Decode via the codec registry */
3526 v = PyCodec_Decode(unicode, encoding, errors);
3527 if (v == NULL)
3528 goto onError;
3529 if (!PyUnicode_Check(v)) {
3530 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003531 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003532 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003533 encoding,
3534 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003535 Py_DECREF(v);
3536 goto onError;
3537 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003538 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003539
Benjamin Peterson29060642009-01-31 22:14:21 +00003540 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003541 return NULL;
3542}
3543
Alexander Belopolsky40018472011-02-26 01:02:56 +00003544PyObject *
3545PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003546 Py_ssize_t size,
3547 const char *encoding,
3548 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549{
3550 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003551
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003552 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3556 Py_DECREF(unicode);
3557 return v;
3558}
3559
Alexander Belopolsky40018472011-02-26 01:02:56 +00003560PyObject *
3561PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003562 const char *encoding,
3563 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003564{
3565 PyObject *v;
3566
3567 if (!PyUnicode_Check(unicode)) {
3568 PyErr_BadArgument();
3569 goto onError;
3570 }
3571
Serhiy Storchaka00939072016-10-27 21:05:49 +03003572 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3573 "PyUnicode_AsEncodedObject() is deprecated; "
3574 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3575 "or PyCodec_Encode() for generic encoding", 1) < 0)
3576 return NULL;
3577
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003578 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003579 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003580
3581 /* Encode via the codec registry */
3582 v = PyCodec_Encode(unicode, encoding, errors);
3583 if (v == NULL)
3584 goto onError;
3585 return v;
3586
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003588 return NULL;
3589}
3590
Victor Stinner1b579672011-12-17 05:47:23 +01003591
Victor Stinner2cba6b82018-01-10 22:46:15 +01003592static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003593unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003594 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003595{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003596 Py_ssize_t wlen;
3597 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3598 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003599 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003600 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003601
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003602 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003603 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003604 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003605 return NULL;
3606 }
3607
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003608 char *str;
3609 size_t error_pos;
3610 const char *reason;
3611 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003612 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003613 PyMem_Free(wstr);
3614
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003615 if (res != 0) {
3616 if (res == -2) {
3617 PyObject *exc;
3618 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3619 "locale", unicode,
3620 (Py_ssize_t)error_pos,
3621 (Py_ssize_t)(error_pos+1),
3622 reason);
3623 if (exc != NULL) {
3624 PyCodec_StrictErrors(exc);
3625 Py_DECREF(exc);
3626 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003627 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003628 else if (res == -3) {
3629 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3630 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003631 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003632 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003633 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003634 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003635 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003636
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003637 PyObject *bytes = PyBytes_FromString(str);
3638 PyMem_RawFree(str);
3639 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003640}
3641
Victor Stinnerad158722010-10-27 00:25:46 +00003642PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003643PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3644{
Victor Stinner709d23d2019-05-02 14:56:30 -04003645 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3646 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003647}
3648
3649PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003650PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003651{
Victor Stinner81a7be32020-04-14 15:14:01 +02003652 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003653 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003654 return unicode_encode_utf8(unicode,
3655 interp->fs_codec.error_handler,
3656 interp->fs_codec.errors);
3657 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003658#ifndef _Py_FORCE_UTF8_FS_ENCODING
3659 else if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003660 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003661 interp->fs_codec.encoding,
3662 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003663 }
Victor Stinnerad158722010-10-27 00:25:46 +00003664#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003665 else {
3666 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3667 machinery is not ready and so cannot be used:
3668 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003669 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3670 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003671 assert(filesystem_errors != NULL);
3672 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3673 assert(errors != _Py_ERROR_UNKNOWN);
3674#ifdef _Py_FORCE_UTF8_FS_ENCODING
3675 return unicode_encode_utf8(unicode, errors, NULL);
3676#else
3677 return unicode_encode_locale(unicode, errors, 0);
3678#endif
3679 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003680}
3681
Alexander Belopolsky40018472011-02-26 01:02:56 +00003682PyObject *
3683PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003684 const char *encoding,
3685 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686{
3687 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003688 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003689
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 if (!PyUnicode_Check(unicode)) {
3691 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003692 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 }
Fred Drakee4315f52000-05-09 19:53:39 +00003694
Victor Stinner22eb6892019-06-26 00:51:05 +02003695 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3696 return NULL;
3697 }
3698
Victor Stinner942889a2016-09-05 15:40:10 -07003699 if (encoding == NULL) {
3700 return _PyUnicode_AsUTF8String(unicode, errors);
3701 }
3702
Fred Drakee4315f52000-05-09 19:53:39 +00003703 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003704 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3705 char *lower = buflower;
3706
3707 /* Fast paths */
3708 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3709 lower += 3;
3710 if (*lower == '_') {
3711 /* Match "utf8" and "utf_8" */
3712 lower++;
3713 }
3714
3715 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003717 }
3718 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3719 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3720 }
3721 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3722 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3723 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003724 }
Victor Stinner942889a2016-09-05 15:40:10 -07003725 else {
3726 if (strcmp(lower, "ascii") == 0
3727 || strcmp(lower, "us_ascii") == 0) {
3728 return _PyUnicode_AsASCIIString(unicode, errors);
3729 }
Steve Dowercc16be82016-09-08 10:35:16 -07003730#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003731 else if (strcmp(lower, "mbcs") == 0) {
3732 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3733 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003734#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003735 else if (strcmp(lower, "latin1") == 0 ||
3736 strcmp(lower, "latin_1") == 0 ||
3737 strcmp(lower, "iso_8859_1") == 0 ||
3738 strcmp(lower, "iso8859_1") == 0) {
3739 return _PyUnicode_AsLatin1String(unicode, errors);
3740 }
3741 }
Victor Stinner37296e82010-06-10 13:36:23 +00003742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743
3744 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003745 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003747 return NULL;
3748
3749 /* The normal path */
3750 if (PyBytes_Check(v))
3751 return v;
3752
3753 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003754 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003755 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003756 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003757
3758 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003759 "encoder %s returned bytearray instead of bytes; "
3760 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003761 encoding);
3762 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003763 Py_DECREF(v);
3764 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003765 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003766
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003767 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3768 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003769 Py_DECREF(v);
3770 return b;
3771 }
3772
3773 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003774 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003775 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003776 encoding,
3777 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003778 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003779 return NULL;
3780}
3781
Alexander Belopolsky40018472011-02-26 01:02:56 +00003782PyObject *
3783PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003784 const char *encoding,
3785 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003786{
3787 PyObject *v;
3788
3789 if (!PyUnicode_Check(unicode)) {
3790 PyErr_BadArgument();
3791 goto onError;
3792 }
3793
Serhiy Storchaka00939072016-10-27 21:05:49 +03003794 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3795 "PyUnicode_AsEncodedUnicode() is deprecated; "
3796 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3797 return NULL;
3798
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003799 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003800 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003801
3802 /* Encode via the codec registry */
3803 v = PyCodec_Encode(unicode, encoding, errors);
3804 if (v == NULL)
3805 goto onError;
3806 if (!PyUnicode_Check(v)) {
3807 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003808 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003809 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003810 encoding,
3811 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003812 Py_DECREF(v);
3813 goto onError;
3814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003816
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 return NULL;
3819}
3820
Victor Stinner2cba6b82018-01-10 22:46:15 +01003821static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003822unicode_decode_locale(const char *str, Py_ssize_t len,
3823 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003824{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003825 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3826 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003827 return NULL;
3828 }
3829
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003830 wchar_t *wstr;
3831 size_t wlen;
3832 const char *reason;
3833 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003834 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003835 if (res != 0) {
3836 if (res == -2) {
3837 PyObject *exc;
3838 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3839 "locale", str, len,
3840 (Py_ssize_t)wlen,
3841 (Py_ssize_t)(wlen + 1),
3842 reason);
3843 if (exc != NULL) {
3844 PyCodec_StrictErrors(exc);
3845 Py_DECREF(exc);
3846 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003847 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003848 else if (res == -3) {
3849 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3850 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003851 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003852 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003853 }
Victor Stinner2f197072011-12-17 07:08:30 +01003854 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003855 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003856
3857 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3858 PyMem_RawFree(wstr);
3859 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003860}
3861
3862PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003863PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3864 const char *errors)
3865{
Victor Stinner709d23d2019-05-02 14:56:30 -04003866 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3867 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003868}
3869
3870PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003871PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003872{
3873 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003874 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3875 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003876}
3877
3878
3879PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003880PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003881 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003882 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3883}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003884
Christian Heimes5894ba72007-11-04 11:43:14 +00003885PyObject*
3886PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3887{
Victor Stinner81a7be32020-04-14 15:14:01 +02003888 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003889 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003890 return unicode_decode_utf8(s, size,
3891 interp->fs_codec.error_handler,
3892 interp->fs_codec.errors,
3893 NULL);
3894 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003895#ifndef _Py_FORCE_UTF8_FS_ENCODING
3896 else if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003897 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003898 interp->fs_codec.encoding,
3899 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003900 }
Victor Stinnerad158722010-10-27 00:25:46 +00003901#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003902 else {
3903 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3904 machinery is not ready and so cannot be used:
3905 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003906 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3907 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003908 assert(filesystem_errors != NULL);
3909 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3910 assert(errors != _Py_ERROR_UNKNOWN);
3911#ifdef _Py_FORCE_UTF8_FS_ENCODING
3912 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3913#else
3914 return unicode_decode_locale(s, size, errors, 0);
3915#endif
3916 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003917}
3918
Martin v. Löwis011e8422009-05-05 04:43:17 +00003919
3920int
3921PyUnicode_FSConverter(PyObject* arg, void* addr)
3922{
Brett Cannonec6ce872016-09-06 15:50:29 -07003923 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003924 PyObject *output = NULL;
3925 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003926 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003927 if (arg == NULL) {
3928 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003929 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003930 return 1;
3931 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003932 path = PyOS_FSPath(arg);
3933 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003934 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003935 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003936 if (PyBytes_Check(path)) {
3937 output = path;
3938 }
3939 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3940 output = PyUnicode_EncodeFSDefault(path);
3941 Py_DECREF(path);
3942 if (!output) {
3943 return 0;
3944 }
3945 assert(PyBytes_Check(output));
3946 }
3947
Victor Stinner0ea2a462010-04-30 00:22:08 +00003948 size = PyBytes_GET_SIZE(output);
3949 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003950 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003951 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003952 Py_DECREF(output);
3953 return 0;
3954 }
3955 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003956 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003957}
3958
3959
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003960int
3961PyUnicode_FSDecoder(PyObject* arg, void* addr)
3962{
Brett Cannona5711202016-09-06 19:36:01 -07003963 int is_buffer = 0;
3964 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003965 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003966 if (arg == NULL) {
3967 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003968 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003969 return 1;
3970 }
Brett Cannona5711202016-09-06 19:36:01 -07003971
3972 is_buffer = PyObject_CheckBuffer(arg);
3973 if (!is_buffer) {
3974 path = PyOS_FSPath(arg);
3975 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003976 return 0;
3977 }
Brett Cannona5711202016-09-06 19:36:01 -07003978 }
3979 else {
3980 path = arg;
3981 Py_INCREF(arg);
3982 }
3983
3984 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003985 output = path;
3986 }
3987 else if (PyBytes_Check(path) || is_buffer) {
3988 PyObject *path_bytes = NULL;
3989
3990 if (!PyBytes_Check(path) &&
3991 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003992 "path should be string, bytes, or os.PathLike, not %.200s",
3993 Py_TYPE(arg)->tp_name)) {
3994 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003995 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003996 }
3997 path_bytes = PyBytes_FromObject(path);
3998 Py_DECREF(path);
3999 if (!path_bytes) {
4000 return 0;
4001 }
4002 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4003 PyBytes_GET_SIZE(path_bytes));
4004 Py_DECREF(path_bytes);
4005 if (!output) {
4006 return 0;
4007 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004008 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004009 else {
4010 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004011 "path should be string, bytes, or os.PathLike, not %.200s",
4012 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004013 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004014 return 0;
4015 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004016 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004017 Py_DECREF(output);
4018 return 0;
4019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004020 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004021 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004022 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004023 Py_DECREF(output);
4024 return 0;
4025 }
4026 *(PyObject**)addr = output;
4027 return Py_CLEANUP_SUPPORTED;
4028}
4029
4030
Inada Naoki02a4d572020-02-27 13:48:59 +09004031static int unicode_fill_utf8(PyObject *unicode);
4032
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004033const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004035{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004036 if (!PyUnicode_Check(unicode)) {
4037 PyErr_BadArgument();
4038 return NULL;
4039 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004040 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004041 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004043 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004044 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 return NULL;
4046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 }
4048
4049 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004050 *psize = PyUnicode_UTF8_LENGTH(unicode);
4051 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004052}
4053
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004054const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004056{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4058}
4059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060Py_UNICODE *
4061PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 if (!PyUnicode_Check(unicode)) {
4064 PyErr_BadArgument();
4065 return NULL;
4066 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004067 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4068 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004070 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004071 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072
Serhiy Storchakac46db922018-10-23 22:58:24 +03004073 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4074 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4075 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004078 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4079 if (w == NULL) {
4080 PyErr_NoMemory();
4081 return NULL;
4082 }
4083 unicode_copy_as_widechar(unicode, w, wlen + 1);
4084 _PyUnicode_WSTR(unicode) = w;
4085 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4086 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004087 }
4088 }
4089 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004090 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004091 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004092}
4093
Alexander Belopolsky40018472011-02-26 01:02:56 +00004094Py_UNICODE *
4095PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098}
4099
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004100const Py_UNICODE *
4101_PyUnicode_AsUnicode(PyObject *unicode)
4102{
4103 Py_ssize_t size;
4104 const Py_UNICODE *wstr;
4105
4106 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4107 if (wstr && wcslen(wstr) != (size_t)size) {
4108 PyErr_SetString(PyExc_ValueError, "embedded null character");
4109 return NULL;
4110 }
4111 return wstr;
4112}
4113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114
Alexander Belopolsky40018472011-02-26 01:02:56 +00004115Py_ssize_t
4116PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117{
4118 if (!PyUnicode_Check(unicode)) {
4119 PyErr_BadArgument();
4120 goto onError;
4121 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004122 if (_PyUnicode_WSTR(unicode) == NULL) {
4123 if (PyUnicode_AsUnicode(unicode) == NULL)
4124 goto onError;
4125 }
4126 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127
Benjamin Peterson29060642009-01-31 22:14:21 +00004128 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 return -1;
4130}
4131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132Py_ssize_t
4133PyUnicode_GetLength(PyObject *unicode)
4134{
Victor Stinner07621332012-06-16 04:53:46 +02004135 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136 PyErr_BadArgument();
4137 return -1;
4138 }
Victor Stinner07621332012-06-16 04:53:46 +02004139 if (PyUnicode_READY(unicode) == -1)
4140 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004141 return PyUnicode_GET_LENGTH(unicode);
4142}
4143
4144Py_UCS4
4145PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4146{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004147 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004148 int kind;
4149
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004150 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004151 PyErr_BadArgument();
4152 return (Py_UCS4)-1;
4153 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004154 if (PyUnicode_READY(unicode) == -1) {
4155 return (Py_UCS4)-1;
4156 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004157 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004158 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 return (Py_UCS4)-1;
4160 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004161 data = PyUnicode_DATA(unicode);
4162 kind = PyUnicode_KIND(unicode);
4163 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004164}
4165
4166int
4167PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4168{
4169 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004170 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004171 return -1;
4172 }
Victor Stinner488fa492011-12-12 00:01:39 +01004173 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004174 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004175 PyErr_SetString(PyExc_IndexError, "string index out of range");
4176 return -1;
4177 }
Victor Stinner488fa492011-12-12 00:01:39 +01004178 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004179 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004180 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4181 PyErr_SetString(PyExc_ValueError, "character out of range");
4182 return -1;
4183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004184 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4185 index, ch);
4186 return 0;
4187}
4188
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4194
Victor Stinner554f3f02010-06-16 23:33:54 +00004195/* create or adjust a UnicodeDecodeError */
4196static void
4197make_decode_exception(PyObject **exceptionObject,
4198 const char *encoding,
4199 const char *input, Py_ssize_t length,
4200 Py_ssize_t startpos, Py_ssize_t endpos,
4201 const char *reason)
4202{
4203 if (*exceptionObject == NULL) {
4204 *exceptionObject = PyUnicodeDecodeError_Create(
4205 encoding, input, length, startpos, endpos, reason);
4206 }
4207 else {
4208 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4209 goto onError;
4210 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4211 goto onError;
4212 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4213 goto onError;
4214 }
4215 return;
4216
4217onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004218 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004219}
4220
Steve Dowercc16be82016-09-08 10:35:16 -07004221#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004222static int
4223widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4224{
4225 if (newsize > *size) {
4226 wchar_t *newbuf = *buf;
4227 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4228 PyErr_NoMemory();
4229 return -1;
4230 }
4231 *buf = newbuf;
4232 }
4233 *size = newsize;
4234 return 0;
4235}
4236
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237/* error handling callback helper:
4238 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004239 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 and adjust various state variables.
4241 return 0 on success, -1 on error
4242*/
4243
Alexander Belopolsky40018472011-02-26 01:02:56 +00004244static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004245unicode_decode_call_errorhandler_wchar(
4246 const char *errors, PyObject **errorHandler,
4247 const char *encoding, const char *reason,
4248 const char **input, const char **inend, Py_ssize_t *startinpos,
4249 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004250 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004252 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253
4254 PyObject *restuple = NULL;
4255 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004256 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004257 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004258 Py_ssize_t requiredsize;
4259 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004260 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004261 wchar_t *repwstr;
4262 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263
4264 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 *errorHandler = PyCodec_LookupError(errors);
4266 if (*errorHandler == NULL)
4267 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 }
4269
Victor Stinner554f3f02010-06-16 23:33:54 +00004270 make_decode_exception(exceptionObject,
4271 encoding,
4272 *input, *inend - *input,
4273 *startinpos, *endinpos,
4274 reason);
4275 if (*exceptionObject == NULL)
4276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277
Petr Viktorinffd97532020-02-11 17:46:57 +01004278 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004282 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004285 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287
4288 /* Copy back the bytes variables, which might have been modified by the
4289 callback */
4290 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4291 if (!inputobj)
4292 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004293 *input = PyBytes_AS_STRING(inputobj);
4294 insize = PyBytes_GET_SIZE(inputobj);
4295 *inend = *input + insize;
4296 /* we can DECREF safely, as the exception has another reference,
4297 so the object won't go away. */
4298 Py_DECREF(inputobj);
4299
4300 if (newpos<0)
4301 newpos = insize+newpos;
4302 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004303 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004304 goto onError;
4305 }
4306
4307 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4308 if (repwstr == NULL)
4309 goto onError;
4310 /* need more space? (at least enough for what we
4311 have+the replacement+the rest of the string (starting
4312 at the new input position), so we won't have to check space
4313 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004314 requiredsize = *outpos;
4315 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4316 goto overflow;
4317 requiredsize += repwlen;
4318 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4319 goto overflow;
4320 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004321 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004322 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004323 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004324 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004325 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004326 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004327 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004329 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004331 *endinpos = newpos;
4332 *inptr = *input + newpos;
4333
4334 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004335 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004336 return 0;
4337
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004338 overflow:
4339 PyErr_SetString(PyExc_OverflowError,
4340 "decoded result is too long for a Python string");
4341
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342 onError:
4343 Py_XDECREF(restuple);
4344 return -1;
4345}
Steve Dowercc16be82016-09-08 10:35:16 -07004346#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004347
4348static int
4349unicode_decode_call_errorhandler_writer(
4350 const char *errors, PyObject **errorHandler,
4351 const char *encoding, const char *reason,
4352 const char **input, const char **inend, Py_ssize_t *startinpos,
4353 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4354 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4355{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004356 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004357
4358 PyObject *restuple = NULL;
4359 PyObject *repunicode = NULL;
4360 Py_ssize_t insize;
4361 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004362 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004363 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004364 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004365 int need_to_grow = 0;
4366 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367
4368 if (*errorHandler == NULL) {
4369 *errorHandler = PyCodec_LookupError(errors);
4370 if (*errorHandler == NULL)
4371 goto onError;
4372 }
4373
4374 make_decode_exception(exceptionObject,
4375 encoding,
4376 *input, *inend - *input,
4377 *startinpos, *endinpos,
4378 reason);
4379 if (*exceptionObject == NULL)
4380 goto onError;
4381
Petr Viktorinffd97532020-02-11 17:46:57 +01004382 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004383 if (restuple == NULL)
4384 goto onError;
4385 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004386 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004387 goto onError;
4388 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004389 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004390 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004391
4392 /* Copy back the bytes variables, which might have been modified by the
4393 callback */
4394 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4395 if (!inputobj)
4396 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004397 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004398 *input = PyBytes_AS_STRING(inputobj);
4399 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004400 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004401 /* we can DECREF safely, as the exception has another reference,
4402 so the object won't go away. */
4403 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004404
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004407 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004408 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004410 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411
Victor Stinner170ca6f2013-04-18 00:25:28 +02004412 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004413 if (replen > 1) {
4414 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004415 need_to_grow = 1;
4416 }
4417 new_inptr = *input + newpos;
4418 if (*inend - new_inptr > remain) {
4419 /* We don't know the decoding algorithm here so we make the worst
4420 assumption that one byte decodes to one unicode character.
4421 If unfortunately one byte could decode to more unicode characters,
4422 the decoder may write out-of-bound then. Is it possible for the
4423 algorithms using this function? */
4424 writer->min_length += *inend - new_inptr - remain;
4425 need_to_grow = 1;
4426 }
4427 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004428 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004429 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004430 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4431 goto onError;
4432 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004433 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004434 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004437 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004440 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004441 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004445 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446}
4447
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that any
 * expression (e.g. a conditional) can be passed without changing the
 * meaning of the surrounding && / || chain.  Note that c is evaluated
 * more than once, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528
Alexander Belopolsky40018472011-02-26 01:02:56 +00004529PyObject *
4530PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004531 Py_ssize_t size,
4532 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004533{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004534 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4535}
4536
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537/* The decoder. The only state we preserve is our read position,
4538 * i.e. how many characters we have consumed. So if we end in the
4539 * middle of a shift sequence we have to back off the read position
4540 * and the output to the beginning of the sequence, otherwise we lose
4541 * all the shift state (seen bits, number of bits seen, high
4542 * surrogate). */
4543
Alexander Belopolsky40018472011-02-26 01:02:56 +00004544PyObject *
4545PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004546 Py_ssize_t size,
4547 const char *errors,
4548 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004549{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004551 Py_ssize_t startinpos;
4552 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 const char *errmsg = "";
4556 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004557 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 unsigned int base64bits = 0;
4559 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004560 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 PyObject *errorHandler = NULL;
4562 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004564 if (size == 0) {
4565 if (consumed)
4566 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004567 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004568 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004570 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004571 _PyUnicodeWriter_Init(&writer);
4572 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004573
4574 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575 e = s + size;
4576
4577 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004578 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004580 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 if (inShift) { /* in a base-64 section */
4583 if (IS_BASE64(ch)) { /* consume a base-64 character */
4584 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4585 base64bits += 6;
4586 s++;
4587 if (base64bits >= 16) {
4588 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004589 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 base64bits -= 16;
4591 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004592 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 if (surrogate) {
4594 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004595 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4596 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004597 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004598 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004600 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 }
4602 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004603 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004604 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 }
4607 }
Victor Stinner551ac952011-11-29 22:58:13 +01004608 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 /* first surrogate */
4610 surrogate = outCh;
4611 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004613 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 }
4616 }
4617 }
4618 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004620 if (base64bits > 0) { /* left-over bits */
4621 if (base64bits >= 6) {
4622 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004623 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 errmsg = "partial character in shift sequence";
4625 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 else {
4628 /* Some bits remain; they should be zero */
4629 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004630 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004631 errmsg = "non-zero padding bits in shift sequence";
4632 goto utf7Error;
4633 }
4634 }
4635 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004636 if (surrogate && DECODE_DIRECT(ch)) {
4637 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4638 goto onError;
4639 }
4640 surrogate = 0;
4641 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642 /* '-' is absorbed; other terminating
4643 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004644 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004645 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646 }
4647 }
4648 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004650 s++; /* consume '+' */
4651 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004653 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004654 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004656 else if (s < e && !IS_BASE64(*s)) {
4657 s++;
4658 errmsg = "ill-formed sequence";
4659 goto utf7Error;
4660 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004663 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004664 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004666 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004667 }
4668 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004670 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004671 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004672 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004673 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 else {
4675 startinpos = s-starts;
4676 s++;
4677 errmsg = "unexpected special character";
4678 goto utf7Error;
4679 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004680 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004683 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 errors, &errorHandler,
4685 "utf7", errmsg,
4686 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004687 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689 }
4690
Antoine Pitrou244651a2009-05-04 18:56:13 +00004691 /* end of string */
4692
4693 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4694 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004695 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004696 if (surrogate ||
4697 (base64bits >= 6) ||
4698 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004700 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004701 errors, &errorHandler,
4702 "utf7", "unterminated shift sequence",
4703 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004704 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004705 goto onError;
4706 if (s < e)
4707 goto restart;
4708 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004709 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004710
4711 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004712 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004713 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004714 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004715 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004716 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004717 writer.kind, writer.data, shiftOutStart);
4718 Py_XDECREF(errorHandler);
4719 Py_XDECREF(exc);
4720 _PyUnicodeWriter_Dealloc(&writer);
4721 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004722 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004723 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 }
4725 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004726 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004728 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004729
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 Py_XDECREF(errorHandler);
4731 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004732 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 Py_XDECREF(errorHandler);
4736 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004737 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004738 return NULL;
4739}
4740
4741
Alexander Belopolsky40018472011-02-26 01:02:56 +00004742PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004743_PyUnicode_EncodeUTF7(PyObject *str,
4744 int base64SetO,
4745 int base64WhiteSpace,
4746 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004748 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004749 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004750 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004751 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004752 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004753 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004754 unsigned int base64bits = 0;
4755 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004756 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004757 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758
Benjamin Petersonbac79492012-01-14 13:34:47 -05004759 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004760 return NULL;
4761 kind = PyUnicode_KIND(str);
4762 data = PyUnicode_DATA(str);
4763 len = PyUnicode_GET_LENGTH(str);
4764
4765 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004767
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004768 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004769 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004770 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004771 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004772 if (v == NULL)
4773 return NULL;
4774
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004775 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004776 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004777 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004778
Antoine Pitrou244651a2009-05-04 18:56:13 +00004779 if (inShift) {
4780 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4781 /* shifting out */
4782 if (base64bits) { /* output remaining bits */
4783 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4784 base64buffer = 0;
4785 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004786 }
4787 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 /* Characters not in the BASE64 set implicitly unshift the sequence
4789 so no '-' is required, except if the character is itself a '-' */
4790 if (IS_BASE64(ch) || ch == '-') {
4791 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004792 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004793 *out++ = (char) ch;
4794 }
4795 else {
4796 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004797 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004798 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004799 else { /* not in a shift sequence */
4800 if (ch == '+') {
4801 *out++ = '+';
4802 *out++ = '-';
4803 }
4804 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4805 *out++ = (char) ch;
4806 }
4807 else {
4808 *out++ = '+';
4809 inShift = 1;
4810 goto encode_char;
4811 }
4812 }
4813 continue;
4814encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004816 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004817
Antoine Pitrou244651a2009-05-04 18:56:13 +00004818 /* code first surrogate */
4819 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004820 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821 while (base64bits >= 6) {
4822 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4823 base64bits -= 6;
4824 }
4825 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004826 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004828 base64bits += 16;
4829 base64buffer = (base64buffer << 16) | ch;
4830 while (base64bits >= 6) {
4831 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4832 base64bits -= 6;
4833 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004834 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004835 if (base64bits)
4836 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4837 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004838 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004839 if (_PyBytes_Resize(&v, out - start) < 0)
4840 return NULL;
4841 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004842}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004843PyObject *
4844PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4845 Py_ssize_t size,
4846 int base64SetO,
4847 int base64WhiteSpace,
4848 const char *errors)
4849{
4850 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004851 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004852 if (tmp == NULL)
4853 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004854 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004855 base64WhiteSpace, errors);
4856 Py_DECREF(tmp);
4857 return result;
4858}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004859
Antoine Pitrou244651a2009-05-04 18:56:13 +00004860#undef IS_BASE64
4861#undef FROM_BASE64
4862#undef TO_BASE64
4863#undef DECODE_DIRECT
4864#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004865
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866/* --- UTF-8 Codec -------------------------------------------------------- */
4867
Alexander Belopolsky40018472011-02-26 01:02:56 +00004868PyObject *
4869PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004870 Py_ssize_t size,
4871 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872{
Walter Dörwald69652032004-09-07 20:24:22 +00004873 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4874}
4875
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004876#include "stringlib/asciilib.h"
4877#include "stringlib/codecs.h"
4878#include "stringlib/undef.h"
4879
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004880#include "stringlib/ucs1lib.h"
4881#include "stringlib/codecs.h"
4882#include "stringlib/undef.h"
4883
4884#include "stringlib/ucs2lib.h"
4885#include "stringlib/codecs.h"
4886#include "stringlib/undef.h"
4887
4888#include "stringlib/ucs4lib.h"
4889#include "stringlib/codecs.h"
4890#include "stringlib/undef.h"
4891
Antoine Pitrouab868312009-01-10 15:40:25 +00004892/* Mask to quickly check whether a C 'long' contains a
4893 non-ASCII, UTF8-encoded char. */
4894#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004895# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004896#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004897# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004898#else
4899# error C 'long' size should be either 4 or 8!
4900#endif
4901
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902static Py_ssize_t
4903ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004904{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004906 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004907
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004908 /*
4909 * Issue #17237: m68k is a bit different from most architectures in
4910 * that objects do not use "natural alignment" - for example, int and
4911 * long are only aligned at 2-byte boundaries. Therefore the assert()
4912 * won't work; also, tests have shown that skipping the "optimised
4913 * version" will even speed up m68k.
4914 */
4915#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004917 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4918 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919 /* Fast path, see in STRINGLIB(utf8_decode) for
4920 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004921 /* Help allocation */
4922 const char *_p = p;
4923 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924 while (_p < aligned_end) {
4925 unsigned long value = *(const unsigned long *) _p;
4926 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928 *((unsigned long *)q) = value;
4929 _p += SIZEOF_LONG;
4930 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004931 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 p = _p;
4933 while (p < end) {
4934 if ((unsigned char)*p & 0x80)
4935 break;
4936 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004941#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004942 while (p < end) {
4943 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4944 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004945 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004946 /* Help allocation */
4947 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004948 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004949 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 if (value & ASCII_CHAR_MASK)
4951 break;
4952 _p += SIZEOF_LONG;
4953 }
4954 p = _p;
4955 if (_p == end)
4956 break;
4957 }
4958 if ((unsigned char)*p & 0x80)
4959 break;
4960 ++p;
4961 }
4962 memcpy(dest, start, p - start);
4963 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964}
Antoine Pitrouab868312009-01-10 15:40:25 +00004965
Victor Stinner709d23d2019-05-02 14:56:30 -04004966static PyObject *
4967unicode_decode_utf8(const char *s, Py_ssize_t size,
4968 _Py_error_handler error_handler, const char *errors,
4969 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004970{
Victor Stinner785938e2011-12-11 20:09:03 +01004971 if (size == 0) {
4972 if (consumed)
4973 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004974 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004975 }
4976
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4978 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004979 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004980 *consumed = 1;
4981 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004982 }
4983
Inada Naoki770847a2019-06-24 12:30:24 +09004984 const char *starts = s;
4985 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004986
Inada Naoki770847a2019-06-24 12:30:24 +09004987 // fast path: try ASCII string.
4988 PyObject *u = PyUnicode_New(size, 127);
4989 if (u == NULL) {
4990 return NULL;
4991 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004992 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09004993 if (s == end) {
4994 return u;
4995 }
4996
4997 // Use _PyUnicodeWriter after fast path is failed.
4998 _PyUnicodeWriter writer;
4999 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5000 writer.pos = s - starts;
5001
5002 Py_ssize_t startinpos, endinpos;
5003 const char *errmsg = "";
5004 PyObject *error_handler_obj = NULL;
5005 PyObject *exc = NULL;
5006
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 while (s < end) {
5008 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005010
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005012 if (PyUnicode_IS_ASCII(writer.buffer))
5013 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005014 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005015 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005016 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005017 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005018 } else {
5019 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005020 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 }
5022
5023 switch (ch) {
5024 case 0:
5025 if (s == end || consumed)
5026 goto End;
5027 errmsg = "unexpected end of data";
5028 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005029 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005030 break;
5031 case 1:
5032 errmsg = "invalid start byte";
5033 startinpos = s - starts;
5034 endinpos = startinpos + 1;
5035 break;
5036 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005037 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5038 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5039 {
5040 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005041 goto End;
5042 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005043 /* fall through */
5044 case 3:
5045 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005046 errmsg = "invalid continuation byte";
5047 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005048 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005049 break;
5050 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005051 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005052 goto onError;
5053 continue;
5054 }
5055
Victor Stinner1d65d912015-10-05 13:43:50 +02005056 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005057 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005058
5059 switch (error_handler) {
5060 case _Py_ERROR_IGNORE:
5061 s += (endinpos - startinpos);
5062 break;
5063
5064 case _Py_ERROR_REPLACE:
5065 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5066 goto onError;
5067 s += (endinpos - startinpos);
5068 break;
5069
5070 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005071 {
5072 Py_ssize_t i;
5073
Victor Stinner1d65d912015-10-05 13:43:50 +02005074 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5075 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005076 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005077 ch = (Py_UCS4)(unsigned char)(starts[i]);
5078 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5079 ch + 0xdc00);
5080 writer.pos++;
5081 }
5082 s += (endinpos - startinpos);
5083 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005084 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005085
5086 default:
5087 if (unicode_decode_call_errorhandler_writer(
5088 errors, &error_handler_obj,
5089 "utf-8", errmsg,
5090 &starts, &end, &startinpos, &endinpos, &exc, &s,
5091 &writer))
5092 goto onError;
5093 }
Victor Stinner785938e2011-12-11 20:09:03 +01005094 }
5095
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005096End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005097 if (consumed)
5098 *consumed = s - starts;
5099
Victor Stinner1d65d912015-10-05 13:43:50 +02005100 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005101 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005102 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005103
5104onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005105 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005106 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005107 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005108 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005109}
5110
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005111
Victor Stinner709d23d2019-05-02 14:56:30 -04005112PyObject *
5113PyUnicode_DecodeUTF8Stateful(const char *s,
5114 Py_ssize_t size,
5115 const char *errors,
5116 Py_ssize_t *consumed)
5117{
5118 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5119}
5120
5121
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005122/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5123 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005124
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005125 On success, write a pointer to a newly allocated wide character string into
5126 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5127 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005128
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005129 On memory allocation failure, return -1.
5130
5131 On decoding error (if surrogateescape is zero), return -2. If wlen is
5132 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5133 is not NULL, write the decoding error message into *reason. */
5134int
5135_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005136 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005137{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005138 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005139 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005140 wchar_t *unicode;
5141 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005142
Victor Stinner3d4226a2018-08-29 22:21:32 +02005143 int surrogateescape = 0;
5144 int surrogatepass = 0;
5145 switch (errors)
5146 {
5147 case _Py_ERROR_STRICT:
5148 break;
5149 case _Py_ERROR_SURROGATEESCAPE:
5150 surrogateescape = 1;
5151 break;
5152 case _Py_ERROR_SURROGATEPASS:
5153 surrogatepass = 1;
5154 break;
5155 default:
5156 return -3;
5157 }
5158
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005159 /* Note: size will always be longer than the resulting Unicode
5160 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005161 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005162 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005163 }
5164
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005165 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005166 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005167 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005168 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005169
5170 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005171 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005172 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005173 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005174 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005175#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005176 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005177#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005178 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005179#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005180 if (ch > 0xFF) {
5181#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005182 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005183#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005184 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005185 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005186 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5187 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5188#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005189 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005190 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005191 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005192 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005193 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005194
5195 if (surrogateescape) {
5196 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5197 }
5198 else {
5199 /* Is it a valid three-byte code? */
5200 if (surrogatepass
5201 && (e - s) >= 3
5202 && (s[0] & 0xf0) == 0xe0
5203 && (s[1] & 0xc0) == 0x80
5204 && (s[2] & 0xc0) == 0x80)
5205 {
5206 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5207 s += 3;
5208 unicode[outpos++] = ch;
5209 }
5210 else {
5211 PyMem_RawFree(unicode );
5212 if (reason != NULL) {
5213 switch (ch) {
5214 case 0:
5215 *reason = "unexpected end of data";
5216 break;
5217 case 1:
5218 *reason = "invalid start byte";
5219 break;
5220 /* 2, 3, 4 */
5221 default:
5222 *reason = "invalid continuation byte";
5223 break;
5224 }
5225 }
5226 if (wlen != NULL) {
5227 *wlen = s - orig_s;
5228 }
5229 return -2;
5230 }
5231 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005232 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005233 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005234 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005235 if (wlen) {
5236 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005237 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005238 *wstr = unicode;
5239 return 0;
5240}
5241
Victor Stinner5f9cf232019-03-19 01:46:25 +01005242
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005243wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005244_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5245 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005246{
5247 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005248 int res = _Py_DecodeUTF8Ex(arg, arglen,
5249 &wstr, wlen,
5250 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005251 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005252 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5253 assert(res != -3);
5254 if (wlen) {
5255 *wlen = (size_t)res;
5256 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005257 return NULL;
5258 }
5259 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005260}
5261
Antoine Pitrouab868312009-01-10 15:40:25 +00005262
Victor Stinnere47e6982017-12-21 15:45:16 +01005263/* UTF-8 encoder using the surrogateescape error handler .
5264
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005265 On success, return 0 and write the newly allocated character string (use
5266 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005267
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005268 On encoding failure, return -2 and write the position of the invalid
5269 surrogate character into *error_pos (if error_pos is set) and the decoding
5270 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005271
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005272 On memory allocation failure, return -1. */
5273int
5274_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005275 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005276{
5277 const Py_ssize_t max_char_size = 4;
5278 Py_ssize_t len = wcslen(text);
5279
5280 assert(len >= 0);
5281
Victor Stinner3d4226a2018-08-29 22:21:32 +02005282 int surrogateescape = 0;
5283 int surrogatepass = 0;
5284 switch (errors)
5285 {
5286 case _Py_ERROR_STRICT:
5287 break;
5288 case _Py_ERROR_SURROGATEESCAPE:
5289 surrogateescape = 1;
5290 break;
5291 case _Py_ERROR_SURROGATEPASS:
5292 surrogatepass = 1;
5293 break;
5294 default:
5295 return -3;
5296 }
5297
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005298 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5299 return -1;
5300 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005301 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005302 if (raw_malloc) {
5303 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005304 }
5305 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005306 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005307 }
5308 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005309 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005310 }
5311
5312 char *p = bytes;
5313 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005314 for (i = 0; i < len; ) {
5315 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005316 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005317 i++;
5318#if Py_UNICODE_SIZE == 2
5319 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5320 && i < len
5321 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5322 {
5323 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5324 i++;
5325 }
5326#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005327
5328 if (ch < 0x80) {
5329 /* Encode ASCII */
5330 *p++ = (char) ch;
5331
5332 }
5333 else if (ch < 0x0800) {
5334 /* Encode Latin-1 */
5335 *p++ = (char)(0xc0 | (ch >> 6));
5336 *p++ = (char)(0x80 | (ch & 0x3f));
5337 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005338 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005339 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005340 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005341 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005342 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005343 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005344 if (reason != NULL) {
5345 *reason = "encoding error";
5346 }
5347 if (raw_malloc) {
5348 PyMem_RawFree(bytes);
5349 }
5350 else {
5351 PyMem_Free(bytes);
5352 }
5353 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005354 }
5355 *p++ = (char)(ch & 0xff);
5356 }
5357 else if (ch < 0x10000) {
5358 *p++ = (char)(0xe0 | (ch >> 12));
5359 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5360 *p++ = (char)(0x80 | (ch & 0x3f));
5361 }
5362 else { /* ch >= 0x10000 */
5363 assert(ch <= MAX_UNICODE);
5364 /* Encode UCS4 Unicode ordinals */
5365 *p++ = (char)(0xf0 | (ch >> 18));
5366 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5367 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5368 *p++ = (char)(0x80 | (ch & 0x3f));
5369 }
5370 }
5371 *p++ = '\0';
5372
5373 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005374 char *bytes2;
5375 if (raw_malloc) {
5376 bytes2 = PyMem_RawRealloc(bytes, final_size);
5377 }
5378 else {
5379 bytes2 = PyMem_Realloc(bytes, final_size);
5380 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005381 if (bytes2 == NULL) {
5382 if (error_pos != NULL) {
5383 *error_pos = (size_t)-1;
5384 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005385 if (raw_malloc) {
5386 PyMem_RawFree(bytes);
5387 }
5388 else {
5389 PyMem_Free(bytes);
5390 }
5391 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005392 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005393 *str = bytes2;
5394 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005395}
5396
5397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005398/* Primary internal function which creates utf8 encoded bytes objects.
5399
5400 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005401 and allocate exactly as much space needed at the end. Else allocate the
5402 maximum possible needed (4 result bytes per Unicode character), and return
5403 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005404*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005405static PyObject *
5406unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5407 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005409 if (!PyUnicode_Check(unicode)) {
5410 PyErr_BadArgument();
5411 return NULL;
5412 }
5413
5414 if (PyUnicode_READY(unicode) == -1)
5415 return NULL;
5416
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005417 if (PyUnicode_UTF8(unicode))
5418 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5419 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005420
Inada Naoki02a4d572020-02-27 13:48:59 +09005421 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005422 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005423 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5424
5425 _PyBytesWriter writer;
5426 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005427
Benjamin Petersonead6b532011-12-20 17:23:42 -06005428 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005429 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005430 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005431 case PyUnicode_1BYTE_KIND:
5432 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5433 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005434 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5435 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005436 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005437 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5438 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005439 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005440 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5441 break;
Tim Peters602f7402002-04-27 18:03:26 +00005442 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005443
5444 if (end == NULL) {
5445 _PyBytesWriter_Dealloc(&writer);
5446 return NULL;
5447 }
5448 return _PyBytesWriter_Finish(&writer, end);
5449}
5450
5451static int
5452unicode_fill_utf8(PyObject *unicode)
5453{
5454 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5455 assert(!PyUnicode_IS_ASCII(unicode));
5456
5457 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005458 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005459 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5460
5461 _PyBytesWriter writer;
5462 char *end;
5463
5464 switch (kind) {
5465 default:
5466 Py_UNREACHABLE();
5467 case PyUnicode_1BYTE_KIND:
5468 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5469 _Py_ERROR_STRICT, NULL);
5470 break;
5471 case PyUnicode_2BYTE_KIND:
5472 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5473 _Py_ERROR_STRICT, NULL);
5474 break;
5475 case PyUnicode_4BYTE_KIND:
5476 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5477 _Py_ERROR_STRICT, NULL);
5478 break;
5479 }
5480 if (end == NULL) {
5481 _PyBytesWriter_Dealloc(&writer);
5482 return -1;
5483 }
5484
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005485 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005486 PyBytes_AS_STRING(writer.buffer);
5487 Py_ssize_t len = end - start;
5488
5489 char *cache = PyObject_MALLOC(len + 1);
5490 if (cache == NULL) {
5491 _PyBytesWriter_Dealloc(&writer);
5492 PyErr_NoMemory();
5493 return -1;
5494 }
5495 _PyUnicode_UTF8(unicode) = cache;
5496 _PyUnicode_UTF8_LENGTH(unicode) = len;
5497 memcpy(cache, start, len);
5498 cache[len] = '\0';
5499 _PyBytesWriter_Dealloc(&writer);
5500 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501}
5502
Alexander Belopolsky40018472011-02-26 01:02:56 +00005503PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005504_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5505{
5506 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5507}
5508
5509
5510PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005511PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5512 Py_ssize_t size,
5513 const char *errors)
5514{
5515 PyObject *v, *unicode;
5516
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005517 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005518 if (unicode == NULL)
5519 return NULL;
5520 v = _PyUnicode_AsUTF8String(unicode, errors);
5521 Py_DECREF(unicode);
5522 return v;
5523}
5524
5525PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005526PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005528 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529}
5530
Walter Dörwald41980ca2007-08-16 21:55:45 +00005531/* --- UTF-32 Codec ------------------------------------------------------- */
5532
5533PyObject *
5534PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005535 Py_ssize_t size,
5536 const char *errors,
5537 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005538{
5539 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5540}
5541
5542PyObject *
5543PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 Py_ssize_t size,
5545 const char *errors,
5546 int *byteorder,
5547 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005548{
5549 const char *starts = s;
5550 Py_ssize_t startinpos;
5551 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005552 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005553 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005554 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005556 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005557 PyObject *errorHandler = NULL;
5558 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005559
Andy Lestere6be9b52020-02-11 20:28:35 -06005560 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005561 e = q + size;
5562
5563 if (byteorder)
5564 bo = *byteorder;
5565
5566 /* Check for BOM marks (U+FEFF) in the input and adjust current
5567 byte order setting accordingly. In native mode, the leading BOM
5568 mark is skipped, in all other modes, it is copied to the output
5569 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005570 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005571 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005572 if (bom == 0x0000FEFF) {
5573 bo = -1;
5574 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005576 else if (bom == 0xFFFE0000) {
5577 bo = 1;
5578 q += 4;
5579 }
5580 if (byteorder)
5581 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005582 }
5583
Victor Stinnere64322e2012-10-30 23:12:47 +01005584 if (q == e) {
5585 if (consumed)
5586 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005587 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005588 }
5589
Victor Stinnere64322e2012-10-30 23:12:47 +01005590#ifdef WORDS_BIGENDIAN
5591 le = bo < 0;
5592#else
5593 le = bo <= 0;
5594#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005595 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005596
Victor Stinner8f674cc2013-04-17 23:02:17 +02005597 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005598 writer.min_length = (e - q + 3) / 4;
5599 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005600 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005601
Victor Stinnere64322e2012-10-30 23:12:47 +01005602 while (1) {
5603 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005604 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005605
Victor Stinnere64322e2012-10-30 23:12:47 +01005606 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005607 enum PyUnicode_Kind kind = writer.kind;
5608 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005609 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005610 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005611 if (le) {
5612 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005613 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005614 if (ch > maxch)
5615 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005616 if (kind != PyUnicode_1BYTE_KIND &&
5617 Py_UNICODE_IS_SURROGATE(ch))
5618 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005619 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005620 q += 4;
5621 } while (q <= last);
5622 }
5623 else {
5624 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005625 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005626 if (ch > maxch)
5627 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005628 if (kind != PyUnicode_1BYTE_KIND &&
5629 Py_UNICODE_IS_SURROGATE(ch))
5630 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005631 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005632 q += 4;
5633 } while (q <= last);
5634 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005636 }
5637
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005638 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005639 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005640 startinpos = ((const char *)q) - starts;
5641 endinpos = startinpos + 4;
5642 }
5643 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005644 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005646 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005648 startinpos = ((const char *)q) - starts;
5649 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005651 else {
5652 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005653 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005654 goto onError;
5655 q += 4;
5656 continue;
5657 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005658 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005659 startinpos = ((const char *)q) - starts;
5660 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005662
5663 /* The remaining input chars are ignored if the callback
5664 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005665 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005667 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005669 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005671 }
5672
Walter Dörwald41980ca2007-08-16 21:55:45 +00005673 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005675
Walter Dörwald41980ca2007-08-16 21:55:45 +00005676 Py_XDECREF(errorHandler);
5677 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005679
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005681 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005682 Py_XDECREF(errorHandler);
5683 Py_XDECREF(exc);
5684 return NULL;
5685}
5686
5687PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005688_PyUnicode_EncodeUTF32(PyObject *str,
5689 const char *errors,
5690 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005691{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005692 enum PyUnicode_Kind kind;
5693 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005694 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005695 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005696 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005697#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005698 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005699#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005700 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005701#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005702 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005703 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005704 PyObject *errorHandler = NULL;
5705 PyObject *exc = NULL;
5706 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005707
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005708 if (!PyUnicode_Check(str)) {
5709 PyErr_BadArgument();
5710 return NULL;
5711 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005712 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005713 return NULL;
5714 kind = PyUnicode_KIND(str);
5715 data = PyUnicode_DATA(str);
5716 len = PyUnicode_GET_LENGTH(str);
5717
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005718 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005719 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005720 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005721 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005722 if (v == NULL)
5723 return NULL;
5724
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005725 /* output buffer is 4-bytes aligned */
5726 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005727 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005728 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005729 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005731 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005732
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005733 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005734 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005735 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005736 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005737 else
5738 encoding = "utf-32";
5739
5740 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005741 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5742 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005743 }
5744
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005745 pos = 0;
5746 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005747 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005748
5749 if (kind == PyUnicode_2BYTE_KIND) {
5750 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5751 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005752 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005753 else {
5754 assert(kind == PyUnicode_4BYTE_KIND);
5755 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5756 &out, native_ordering);
5757 }
5758 if (pos == len)
5759 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005760
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005761 rep = unicode_encode_call_errorhandler(
5762 errors, &errorHandler,
5763 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005764 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 if (!rep)
5766 goto error;
5767
5768 if (PyBytes_Check(rep)) {
5769 repsize = PyBytes_GET_SIZE(rep);
5770 if (repsize & 3) {
5771 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005772 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005773 "surrogates not allowed");
5774 goto error;
5775 }
5776 moreunits = repsize / 4;
5777 }
5778 else {
5779 assert(PyUnicode_Check(rep));
5780 if (PyUnicode_READY(rep) < 0)
5781 goto error;
5782 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5783 if (!PyUnicode_IS_ASCII(rep)) {
5784 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005785 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005786 "surrogates not allowed");
5787 goto error;
5788 }
5789 }
5790
5791 /* four bytes are reserved for each surrogate */
5792 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005793 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005794 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005795 /* integer overflow */
5796 PyErr_NoMemory();
5797 goto error;
5798 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005799 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005800 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005801 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005802 }
5803
5804 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005805 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005806 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005807 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005808 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005809 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5810 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005811 }
5812
5813 Py_CLEAR(rep);
5814 }
5815
5816 /* Cut back to size actually needed. This is necessary for, for example,
5817 encoding of a string containing isolated surrogates and the 'ignore'
5818 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005819 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005820 if (nsize != PyBytes_GET_SIZE(v))
5821 _PyBytes_Resize(&v, nsize);
5822 Py_XDECREF(errorHandler);
5823 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005824 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005825 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005826 error:
5827 Py_XDECREF(rep);
5828 Py_XDECREF(errorHandler);
5829 Py_XDECREF(exc);
5830 Py_XDECREF(v);
5831 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005832}
5833
Alexander Belopolsky40018472011-02-26 01:02:56 +00005834PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005835PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5836 Py_ssize_t size,
5837 const char *errors,
5838 int byteorder)
5839{
5840 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005841 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005842 if (tmp == NULL)
5843 return NULL;
5844 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5845 Py_DECREF(tmp);
5846 return result;
5847}
5848
5849PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005850PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005851{
Victor Stinnerb960b342011-11-20 19:12:52 +01005852 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005853}
5854
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855/* --- UTF-16 Codec ------------------------------------------------------- */
5856
Tim Peters772747b2001-08-09 22:21:55 +00005857PyObject *
5858PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 Py_ssize_t size,
5860 const char *errors,
5861 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862{
Walter Dörwald69652032004-09-07 20:24:22 +00005863 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5864}
5865
5866PyObject *
5867PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 Py_ssize_t size,
5869 const char *errors,
5870 int *byteorder,
5871 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005872{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005874 Py_ssize_t startinpos;
5875 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005876 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005877 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005878 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005879 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005880 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881 PyObject *errorHandler = NULL;
5882 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005883 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884
Andy Lestere6be9b52020-02-11 20:28:35 -06005885 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005886 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
5888 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005889 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005891 /* Check for BOM marks (U+FEFF) in the input and adjust current
5892 byte order setting accordingly. In native mode, the leading BOM
5893 mark is skipped, in all other modes, it is copied to the output
5894 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005895 if (bo == 0 && size >= 2) {
5896 const Py_UCS4 bom = (q[1] << 8) | q[0];
5897 if (bom == 0xFEFF) {
5898 q += 2;
5899 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005901 else if (bom == 0xFFFE) {
5902 q += 2;
5903 bo = 1;
5904 }
5905 if (byteorder)
5906 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908
Antoine Pitrou63065d72012-05-15 23:48:04 +02005909 if (q == e) {
5910 if (consumed)
5911 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005912 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005913 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005914
Christian Heimes743e0cd2012-10-17 23:52:17 +02005915#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005916 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005917 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005918#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005919 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005920 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005921#endif
Tim Peters772747b2001-08-09 22:21:55 +00005922
Antoine Pitrou63065d72012-05-15 23:48:04 +02005923 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005924 character count normally. Error handler will take care of
5925 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005926 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005927 writer.min_length = (e - q + 1) / 2;
5928 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005929 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005930
Antoine Pitrou63065d72012-05-15 23:48:04 +02005931 while (1) {
5932 Py_UCS4 ch = 0;
5933 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005934 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005935 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005936 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005937 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005938 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005939 native_ordering);
5940 else
5941 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005942 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005943 native_ordering);
5944 } else if (kind == PyUnicode_2BYTE_KIND) {
5945 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005946 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005947 native_ordering);
5948 } else {
5949 assert(kind == PyUnicode_4BYTE_KIND);
5950 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005951 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005952 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005953 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005954 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005955
Antoine Pitrou63065d72012-05-15 23:48:04 +02005956 switch (ch)
5957 {
5958 case 0:
5959 /* remaining byte at the end? (size should be even) */
5960 if (q == e || consumed)
5961 goto End;
5962 errmsg = "truncated data";
5963 startinpos = ((const char *)q) - starts;
5964 endinpos = ((const char *)e) - starts;
5965 break;
5966 /* The remaining input chars are ignored if the callback
5967 chooses to skip the input */
5968 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005969 q -= 2;
5970 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005971 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005972 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005973 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005974 endinpos = ((const char *)e) - starts;
5975 break;
5976 case 2:
5977 errmsg = "illegal encoding";
5978 startinpos = ((const char *)q) - 2 - starts;
5979 endinpos = startinpos + 2;
5980 break;
5981 case 3:
5982 errmsg = "illegal UTF-16 surrogate";
5983 startinpos = ((const char *)q) - 4 - starts;
5984 endinpos = startinpos + 2;
5985 break;
5986 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005987 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005988 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 continue;
5990 }
5991
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005992 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005993 errors,
5994 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005995 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005996 &starts,
5997 (const char **)&e,
5998 &startinpos,
5999 &endinpos,
6000 &exc,
6001 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006002 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 }
6005
Antoine Pitrou63065d72012-05-15 23:48:04 +02006006End:
Walter Dörwald69652032004-09-07 20:24:22 +00006007 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006009
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 Py_XDECREF(errorHandler);
6011 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006012 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006015 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 Py_XDECREF(errorHandler);
6017 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 return NULL;
6019}
6020
Tim Peters772747b2001-08-09 22:21:55 +00006021PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006022_PyUnicode_EncodeUTF16(PyObject *str,
6023 const char *errors,
6024 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006026 enum PyUnicode_Kind kind;
6027 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006028 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006029 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006030 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006031 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006032#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006033 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006034#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006035 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006036#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006037 const char *encoding;
6038 Py_ssize_t nsize, pos;
6039 PyObject *errorHandler = NULL;
6040 PyObject *exc = NULL;
6041 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006042
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006043 if (!PyUnicode_Check(str)) {
6044 PyErr_BadArgument();
6045 return NULL;
6046 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006047 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006048 return NULL;
6049 kind = PyUnicode_KIND(str);
6050 data = PyUnicode_DATA(str);
6051 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006052
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006053 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006054 if (kind == PyUnicode_4BYTE_KIND) {
6055 const Py_UCS4 *in = (const Py_UCS4 *)data;
6056 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006057 while (in < end) {
6058 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006059 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006060 }
6061 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006062 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006063 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006065 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006066 nsize = len + pairs + (byteorder == 0);
6067 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006068 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006072 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006073 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006074 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006075 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006076 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006077 }
6078 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006079 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006080 }
Tim Peters772747b2001-08-09 22:21:55 +00006081
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006082 if (kind == PyUnicode_1BYTE_KIND) {
6083 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6084 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006085 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006086
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006087 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006088 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006089 }
6090 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006091 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006092 }
6093 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006094 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006095 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006096
6097 pos = 0;
6098 while (pos < len) {
6099 Py_ssize_t repsize, moreunits;
6100
6101 if (kind == PyUnicode_2BYTE_KIND) {
6102 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6103 &out, native_ordering);
6104 }
6105 else {
6106 assert(kind == PyUnicode_4BYTE_KIND);
6107 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6108 &out, native_ordering);
6109 }
6110 if (pos == len)
6111 break;
6112
6113 rep = unicode_encode_call_errorhandler(
6114 errors, &errorHandler,
6115 encoding, "surrogates not allowed",
6116 str, &exc, pos, pos + 1, &pos);
6117 if (!rep)
6118 goto error;
6119
6120 if (PyBytes_Check(rep)) {
6121 repsize = PyBytes_GET_SIZE(rep);
6122 if (repsize & 1) {
6123 raise_encode_exception(&exc, encoding,
6124 str, pos - 1, pos,
6125 "surrogates not allowed");
6126 goto error;
6127 }
6128 moreunits = repsize / 2;
6129 }
6130 else {
6131 assert(PyUnicode_Check(rep));
6132 if (PyUnicode_READY(rep) < 0)
6133 goto error;
6134 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6135 if (!PyUnicode_IS_ASCII(rep)) {
6136 raise_encode_exception(&exc, encoding,
6137 str, pos - 1, pos,
6138 "surrogates not allowed");
6139 goto error;
6140 }
6141 }
6142
6143 /* two bytes are reserved for each surrogate */
6144 if (moreunits > 1) {
6145 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006146 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006147 /* integer overflow */
6148 PyErr_NoMemory();
6149 goto error;
6150 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006151 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006152 goto error;
6153 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6154 }
6155
6156 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006157 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006158 out += moreunits;
6159 } else /* rep is unicode */ {
6160 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6161 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6162 &out, native_ordering);
6163 }
6164
6165 Py_CLEAR(rep);
6166 }
6167
6168 /* Cut back to size actually needed. This is necessary for, for example,
6169 encoding of a string containing isolated surrogates and the 'ignore' handler
6170 is used. */
6171 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6172 if (nsize != PyBytes_GET_SIZE(v))
6173 _PyBytes_Resize(&v, nsize);
6174 Py_XDECREF(errorHandler);
6175 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006176 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006177 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006178 error:
6179 Py_XDECREF(rep);
6180 Py_XDECREF(errorHandler);
6181 Py_XDECREF(exc);
6182 Py_XDECREF(v);
6183 return NULL;
6184#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185}
6186
Alexander Belopolsky40018472011-02-26 01:02:56 +00006187PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6189 Py_ssize_t size,
6190 const char *errors,
6191 int byteorder)
6192{
6193 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006194 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 if (tmp == NULL)
6196 return NULL;
6197 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6198 Py_DECREF(tmp);
6199 return result;
6200}
6201
6202PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006203PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206}
6207
/* --- Unicode Escape Codec ----------------------------------------------- */
6209
Fredrik Lundh06d12682001-01-24 07:59:11 +00006210static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006211
Alexander Belopolsky40018472011-02-26 01:02:56 +00006212PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006213_PyUnicode_DecodeUnicodeEscape(const char *s,
6214 Py_ssize_t size,
6215 const char *errors,
6216 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006218 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006219 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006221 PyObject *errorHandler = NULL;
6222 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006223
Eric V. Smith42454af2016-10-31 09:22:08 -04006224 // so we can remember if we've seen an invalid escape char or not
6225 *first_invalid_escape = NULL;
6226
Victor Stinner62ec3312016-09-06 17:04:34 -07006227 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006228 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006229 }
6230 /* Escaped strings will always be longer than the resulting
6231 Unicode string, so we start with size here and then reduce the
6232 length after conversion to the true value.
6233 (but if the error callback returns a long replacement string
6234 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006235 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 writer.min_length = size;
6237 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6238 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006239 }
6240
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 end = s + size;
6242 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006243 unsigned char c = (unsigned char) *s++;
6244 Py_UCS4 ch;
6245 int count;
6246 Py_ssize_t startinpos;
6247 Py_ssize_t endinpos;
6248 const char *message;
6249
6250#define WRITE_ASCII_CHAR(ch) \
6251 do { \
6252 assert(ch <= 127); \
6253 assert(writer.pos < writer.size); \
6254 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6255 } while(0)
6256
6257#define WRITE_CHAR(ch) \
6258 do { \
6259 if (ch <= writer.maxchar) { \
6260 assert(writer.pos < writer.size); \
6261 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6262 } \
6263 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6264 goto onError; \
6265 } \
6266 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267
6268 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006269 if (c != '\\') {
6270 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 continue;
6272 }
6273
Victor Stinner62ec3312016-09-06 17:04:34 -07006274 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006276 if (s >= end) {
6277 message = "\\ at end of string";
6278 goto error;
6279 }
6280 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006281
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006283 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006286 case '\n': continue;
6287 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6288 case '\'': WRITE_ASCII_CHAR('\''); continue;
6289 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6290 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006291 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006292 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6293 case 't': WRITE_ASCII_CHAR('\t'); continue;
6294 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6295 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006296 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006297 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006298 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006299 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300
Benjamin Peterson29060642009-01-31 22:14:21 +00006301 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 case '0': case '1': case '2': case '3':
6303 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006305 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006306 ch = (ch<<3) + *s++ - '0';
6307 if (s < end && '0' <= *s && *s <= '7') {
6308 ch = (ch<<3) + *s++ - '0';
6309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 WRITE_CHAR(ch);
6312 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 /* hex escapes */
6315 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006318 message = "truncated \\xXX escape";
6319 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006323 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006324 message = "truncated \\uXXXX escape";
6325 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006328 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006329 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006330 message = "truncated \\UXXXXXXXX escape";
6331 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006333 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 ch <<= 4;
6335 if (c >= '0' && c <= '9') {
6336 ch += c - '0';
6337 }
6338 else if (c >= 'a' && c <= 'f') {
6339 ch += c - ('a' - 10);
6340 }
6341 else if (c >= 'A' && c <= 'F') {
6342 ch += c - ('A' - 10);
6343 }
6344 else {
6345 break;
6346 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006347 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006348 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006349 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 }
6351
6352 /* when we get here, ch is a 32-bit unicode character */
6353 if (ch > MAX_UNICODE) {
6354 message = "illegal Unicode character";
6355 goto error;
6356 }
6357
6358 WRITE_CHAR(ch);
6359 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006360
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006362 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006363 if (ucnhash_CAPI == NULL) {
6364 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006365 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6366 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006367 if (ucnhash_CAPI == NULL) {
6368 PyErr_SetString(
6369 PyExc_UnicodeError,
6370 "\\N escapes not supported (can't load unicodedata module)"
6371 );
6372 goto onError;
6373 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006374 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006375
6376 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006377 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006378 const char *start = ++s;
6379 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006380 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006381 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006382 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006383 namelen = s - start;
6384 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006385 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006386 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006387 ch = 0xffffffff; /* in case 'getcode' messes up */
6388 if (namelen <= INT_MAX &&
6389 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6390 &ch, 0)) {
6391 assert(ch <= MAX_UNICODE);
6392 WRITE_CHAR(ch);
6393 continue;
6394 }
6395 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006396 }
6397 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006398 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006399
6400 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006401 if (*first_invalid_escape == NULL) {
6402 *first_invalid_escape = s-1; /* Back up one char, since we've
6403 already incremented s. */
6404 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006405 WRITE_ASCII_CHAR('\\');
6406 WRITE_CHAR(c);
6407 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006409
6410 error:
6411 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006413 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006414 errors, &errorHandler,
6415 "unicodeescape", message,
6416 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006417 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006418 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006420 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006421
6422#undef WRITE_ASCII_CHAR
6423#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006425
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006426 Py_XDECREF(errorHandler);
6427 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006428 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006429
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006431 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 Py_XDECREF(errorHandler);
6433 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 return NULL;
6435}
6436
Eric V. Smith42454af2016-10-31 09:22:08 -04006437PyObject *
6438PyUnicode_DecodeUnicodeEscape(const char *s,
6439 Py_ssize_t size,
6440 const char *errors)
6441{
6442 const char *first_invalid_escape;
6443 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6444 &first_invalid_escape);
6445 if (result == NULL)
6446 return NULL;
6447 if (first_invalid_escape != NULL) {
6448 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6449 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006450 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006451 Py_DECREF(result);
6452 return NULL;
6453 }
6454 }
6455 return result;
6456}
6457
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006458/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
Alexander Belopolsky40018472011-02-26 01:02:56 +00006460PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006461PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006463 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006466 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006467 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469
Ezio Melottie7f90372012-10-05 03:33:31 +03006470 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006471 escape.
6472
Ezio Melottie7f90372012-10-05 03:33:31 +03006473 For UCS1 strings it's '\xxx', 4 bytes per source character.
6474 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6475 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006476 */
6477
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006478 if (!PyUnicode_Check(unicode)) {
6479 PyErr_BadArgument();
6480 return NULL;
6481 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006482 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006483 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 }
Victor Stinner358af132015-10-12 22:36:57 +02006485
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006486 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006487 if (len == 0) {
6488 return PyBytes_FromStringAndSize(NULL, 0);
6489 }
6490
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491 kind = PyUnicode_KIND(unicode);
6492 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006493 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6494 bytes, and 1 byte characters 4. */
6495 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006496 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006497 return PyErr_NoMemory();
6498 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006499 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006500 if (repr == NULL) {
6501 return NULL;
6502 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006503
Victor Stinner62ec3312016-09-06 17:04:34 -07006504 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006505 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006506 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006507
Victor Stinner62ec3312016-09-06 17:04:34 -07006508 /* U+0000-U+00ff range */
6509 if (ch < 0x100) {
6510 if (ch >= ' ' && ch < 127) {
6511 if (ch != '\\') {
6512 /* Copy printable US ASCII as-is */
6513 *p++ = (char) ch;
6514 }
6515 /* Escape backslashes */
6516 else {
6517 *p++ = '\\';
6518 *p++ = '\\';
6519 }
6520 }
Victor Stinner358af132015-10-12 22:36:57 +02006521
Victor Stinner62ec3312016-09-06 17:04:34 -07006522 /* Map special whitespace to '\t', \n', '\r' */
6523 else if (ch == '\t') {
6524 *p++ = '\\';
6525 *p++ = 't';
6526 }
6527 else if (ch == '\n') {
6528 *p++ = '\\';
6529 *p++ = 'n';
6530 }
6531 else if (ch == '\r') {
6532 *p++ = '\\';
6533 *p++ = 'r';
6534 }
6535
6536 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6537 else {
6538 *p++ = '\\';
6539 *p++ = 'x';
6540 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6541 *p++ = Py_hexdigits[ch & 0x000F];
6542 }
Tim Petersced69f82003-09-16 20:30:58 +00006543 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006544 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006545 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 *p++ = '\\';
6547 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006548 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6549 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6550 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6551 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006553 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6554 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006555
Victor Stinner62ec3312016-09-06 17:04:34 -07006556 /* Make sure that the first two digits are zero */
6557 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006558 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006559 *p++ = 'U';
6560 *p++ = '0';
6561 *p++ = '0';
6562 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6563 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6564 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6565 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6566 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6567 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570
Victor Stinner62ec3312016-09-06 17:04:34 -07006571 assert(p - PyBytes_AS_STRING(repr) > 0);
6572 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6573 return NULL;
6574 }
6575 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576}
6577
Alexander Belopolsky40018472011-02-26 01:02:56 +00006578PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006579PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6580 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006582 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006583 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006584 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006586 }
6587
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006588 result = PyUnicode_AsUnicodeEscapeString(tmp);
6589 Py_DECREF(tmp);
6590 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591}
6592
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6594
Alexander Belopolsky40018472011-02-26 01:02:56 +00006595PyObject *
6596PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006597 Py_ssize_t size,
6598 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006601 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 PyObject *errorHandler = NULL;
6604 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006605
Victor Stinner62ec3312016-09-06 17:04:34 -07006606 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006607 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006608 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006609
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 /* Escaped strings will always be longer than the resulting
6611 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006612 length after conversion to the true value. (But decoding error
6613 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006614 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006615 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006616 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6617 goto onError;
6618 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006619
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 end = s + size;
6621 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006622 unsigned char c = (unsigned char) *s++;
6623 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006624 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006625 Py_ssize_t startinpos;
6626 Py_ssize_t endinpos;
6627 const char *message;
6628
6629#define WRITE_CHAR(ch) \
6630 do { \
6631 if (ch <= writer.maxchar) { \
6632 assert(writer.pos < writer.size); \
6633 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6634 } \
6635 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6636 goto onError; \
6637 } \
6638 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006641 if (c != '\\' || s >= end) {
6642 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006644 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006645
Victor Stinner62ec3312016-09-06 17:04:34 -07006646 c = (unsigned char) *s++;
6647 if (c == 'u') {
6648 count = 4;
6649 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006651 else if (c == 'U') {
6652 count = 8;
6653 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006654 }
6655 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006656 assert(writer.pos < writer.size);
6657 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6658 WRITE_CHAR(c);
6659 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006660 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006661 startinpos = s - starts - 2;
6662
6663 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6664 for (ch = 0; count && s < end; ++s, --count) {
6665 c = (unsigned char)*s;
6666 ch <<= 4;
6667 if (c >= '0' && c <= '9') {
6668 ch += c - '0';
6669 }
6670 else if (c >= 'a' && c <= 'f') {
6671 ch += c - ('a' - 10);
6672 }
6673 else if (c >= 'A' && c <= 'F') {
6674 ch += c - ('A' - 10);
6675 }
6676 else {
6677 break;
6678 }
6679 }
6680 if (!count) {
6681 if (ch <= MAX_UNICODE) {
6682 WRITE_CHAR(ch);
6683 continue;
6684 }
6685 message = "\\Uxxxxxxxx out of range";
6686 }
6687
6688 endinpos = s-starts;
6689 writer.min_length = end - s + writer.pos;
6690 if (unicode_decode_call_errorhandler_writer(
6691 errors, &errorHandler,
6692 "rawunicodeescape", message,
6693 &starts, &end, &startinpos, &endinpos, &exc, &s,
6694 &writer)) {
6695 goto onError;
6696 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006697 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006698
6699#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 Py_XDECREF(errorHandler);
6702 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006703 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006704
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006706 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 Py_XDECREF(errorHandler);
6708 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006710
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711}
6712
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006713
Alexander Belopolsky40018472011-02-26 01:02:56 +00006714PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006715PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716{
Victor Stinner62ec3312016-09-06 17:04:34 -07006717 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006719 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006720 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006721 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006722 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006724 if (!PyUnicode_Check(unicode)) {
6725 PyErr_BadArgument();
6726 return NULL;
6727 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006728 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006729 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006730 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006731 kind = PyUnicode_KIND(unicode);
6732 data = PyUnicode_DATA(unicode);
6733 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006734 if (kind == PyUnicode_1BYTE_KIND) {
6735 return PyBytes_FromStringAndSize(data, len);
6736 }
Victor Stinner0e368262011-11-10 20:12:49 +01006737
Victor Stinner62ec3312016-09-06 17:04:34 -07006738 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6739 bytes, and 1 byte characters 4. */
6740 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006741
Victor Stinner62ec3312016-09-06 17:04:34 -07006742 if (len > PY_SSIZE_T_MAX / expandsize) {
6743 return PyErr_NoMemory();
6744 }
6745 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6746 if (repr == NULL) {
6747 return NULL;
6748 }
6749 if (len == 0) {
6750 return repr;
6751 }
6752
6753 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006754 for (pos = 0; pos < len; pos++) {
6755 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006756
Victor Stinner62ec3312016-09-06 17:04:34 -07006757 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6758 if (ch < 0x100) {
6759 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006760 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006761 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006762 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 *p++ = '\\';
6764 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006765 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6766 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6767 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6768 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006770 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6771 else {
6772 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6773 *p++ = '\\';
6774 *p++ = 'U';
6775 *p++ = '0';
6776 *p++ = '0';
6777 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6778 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6779 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6780 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6781 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6782 *p++ = Py_hexdigits[ch & 15];
6783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006785
Victor Stinner62ec3312016-09-06 17:04:34 -07006786 assert(p > PyBytes_AS_STRING(repr));
6787 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6788 return NULL;
6789 }
6790 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791}
6792
Alexander Belopolsky40018472011-02-26 01:02:56 +00006793PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006794PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6795 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006797 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006798 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006799 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006800 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006801 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6802 Py_DECREF(tmp);
6803 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804}
6805
/* --- Latin-1 Codec ------------------------------------------------------ */
6807
Alexander Belopolsky40018472011-02-26 01:02:56 +00006808PyObject *
6809PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006810 Py_ssize_t size,
6811 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006814 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815}
6816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006817/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006818static void
6819make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006820 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006821 PyObject *unicode,
6822 Py_ssize_t startpos, Py_ssize_t endpos,
6823 const char *reason)
6824{
6825 if (*exceptionObject == NULL) {
6826 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006827 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006828 encoding, unicode, startpos, endpos, reason);
6829 }
6830 else {
6831 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6832 goto onError;
6833 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6834 goto onError;
6835 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6836 goto onError;
6837 return;
6838 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006839 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006840 }
6841}
6842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006843/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006844static void
6845raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006846 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006847 PyObject *unicode,
6848 Py_ssize_t startpos, Py_ssize_t endpos,
6849 const char *reason)
6850{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006851 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006852 encoding, unicode, startpos, endpos, reason);
6853 if (*exceptionObject != NULL)
6854 PyCodec_StrictErrors(*exceptionObject);
6855}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006856
6857/* error handling callback helper:
6858 build arguments, call the callback and check the arguments,
6859 put the result into newpos and return the replacement string, which
6860 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006861static PyObject *
6862unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006863 PyObject **errorHandler,
6864 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006865 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006866 Py_ssize_t startpos, Py_ssize_t endpos,
6867 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006869 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006870 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006871 PyObject *restuple;
6872 PyObject *resunicode;
6873
6874 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006878 }
6879
Benjamin Petersonbac79492012-01-14 13:34:47 -05006880 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006881 return NULL;
6882 len = PyUnicode_GET_LENGTH(unicode);
6883
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006884 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006885 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006888
Petr Viktorinffd97532020-02-11 17:46:57 +01006889 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006892 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006893 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 Py_DECREF(restuple);
6895 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006897 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 &resunicode, newpos)) {
6899 Py_DECREF(restuple);
6900 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006902 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6903 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6904 Py_DECREF(restuple);
6905 return NULL;
6906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006907 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006908 *newpos = len + *newpos;
6909 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006910 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 Py_DECREF(restuple);
6912 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006913 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 Py_INCREF(resunicode);
6915 Py_DECREF(restuple);
6916 return resunicode;
6917}
6918
Alexander Belopolsky40018472011-02-26 01:02:56 +00006919static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006920unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006921 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006922 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006924 /* input state */
6925 Py_ssize_t pos=0, size;
6926 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006927 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928 /* pointer into the output */
6929 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006930 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6931 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006932 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006933 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006934 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006935 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006936 /* output object */
6937 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938
Benjamin Petersonbac79492012-01-14 13:34:47 -05006939 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006940 return NULL;
6941 size = PyUnicode_GET_LENGTH(unicode);
6942 kind = PyUnicode_KIND(unicode);
6943 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944 /* allocate enough for a simple encoding without
6945 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006946 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006947 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006948
6949 _PyBytesWriter_Init(&writer);
6950 str = _PyBytesWriter_Alloc(&writer, size);
6951 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006952 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006953
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006954 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006955 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006956
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006958 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006960 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006961 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006962 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006963 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006964 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006966 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006967 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006969
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006970 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006972
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006973 /* Only overallocate the buffer if it's not the last write */
6974 writer.overallocate = (collend < size);
6975
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006977 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006978 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006979
6980 switch (error_handler) {
6981 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006982 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006984
6985 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006986 memset(str, '?', collend - collstart);
6987 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006988 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006989 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006990 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 break;
Victor Stinner50149202015-09-22 00:26:54 +02006992
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006993 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006994 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006995 writer.min_size -= (collend - collstart);
6996 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006997 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006998 if (str == NULL)
6999 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007000 pos = collend;
7001 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007002
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007003 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007004 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007005 writer.min_size -= (collend - collstart);
7006 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007007 unicode, collstart, collend);
7008 if (str == NULL)
7009 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007010 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 break;
Victor Stinner50149202015-09-22 00:26:54 +02007012
Victor Stinnerc3713e92015-09-29 12:32:13 +02007013 case _Py_ERROR_SURROGATEESCAPE:
7014 for (i = collstart; i < collend; ++i) {
7015 ch = PyUnicode_READ(kind, data, i);
7016 if (ch < 0xdc80 || 0xdcff < ch) {
7017 /* Not a UTF-8b surrogate */
7018 break;
7019 }
7020 *str++ = (char)(ch - 0xdc00);
7021 ++pos;
7022 }
7023 if (i >= collend)
7024 break;
7025 collstart = pos;
7026 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007027 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007028
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007030 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7031 encoding, reason, unicode, &exc,
7032 collstart, collend, &newpos);
7033 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007035
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007036 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007037 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007038
Victor Stinner6bd525b2015-10-09 13:10:05 +02007039 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007040 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007041 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007042 PyBytes_AS_STRING(rep),
7043 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007044 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007045 else {
7046 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007047
Victor Stinner6bd525b2015-10-09 13:10:05 +02007048 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007050
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007051 if (limit == 256 ?
7052 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7053 !PyUnicode_IS_ASCII(rep))
7054 {
7055 /* Not all characters are smaller than limit */
7056 raise_encode_exception(&exc, encoding, unicode,
7057 collstart, collend, reason);
7058 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007060 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7061 str = _PyBytesWriter_WriteBytes(&writer, str,
7062 PyUnicode_DATA(rep),
7063 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007065 if (str == NULL)
7066 goto onError;
7067
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007068 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007069 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007070 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007071
7072 /* If overallocation was disabled, ensure that it was the last
7073 write. Otherwise, we missed an optimization */
7074 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007075 }
7076 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007077
Victor Stinner50149202015-09-22 00:26:54 +02007078 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007079 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007080 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007081
7082 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007083 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007084 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007085 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007086 Py_XDECREF(exc);
7087 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007088}
7089
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007090/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007091PyObject *
7092PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007093 Py_ssize_t size,
7094 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007096 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007097 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007098 if (unicode == NULL)
7099 return NULL;
7100 result = unicode_encode_ucs1(unicode, errors, 256);
7101 Py_DECREF(unicode);
7102 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103}
7104
Alexander Belopolsky40018472011-02-26 01:02:56 +00007105PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007106_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107{
7108 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 PyErr_BadArgument();
7110 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007112 if (PyUnicode_READY(unicode) == -1)
7113 return NULL;
7114 /* Fast path: if it is a one-byte string, construct
7115 bytes object directly. */
7116 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7117 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7118 PyUnicode_GET_LENGTH(unicode));
7119 /* Non-Latin-1 characters present. Defer to above function to
7120 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007121 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007122}
7123
7124PyObject*
7125PyUnicode_AsLatin1String(PyObject *unicode)
7126{
7127 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128}
7129
7130/* --- 7-bit ASCII Codec -------------------------------------------------- */
7131
Alexander Belopolsky40018472011-02-26 01:02:56 +00007132PyObject *
7133PyUnicode_DecodeASCII(const char *s,
7134 Py_ssize_t size,
7135 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007137 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007138 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007139 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007140 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007141 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007142
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007144 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007145
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007147 if (size == 1 && (unsigned char)s[0] < 128)
7148 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007149
Inada Naoki770847a2019-06-24 12:30:24 +09007150 // Shortcut for simple case
7151 PyObject *u = PyUnicode_New(size, 127);
7152 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007153 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007154 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007155 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007156 if (outpos == size) {
7157 return u;
7158 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007159
Inada Naoki770847a2019-06-24 12:30:24 +09007160 _PyUnicodeWriter writer;
7161 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007162 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007163
Inada Naoki770847a2019-06-24 12:30:24 +09007164 s += outpos;
7165 int kind = writer.kind;
7166 void *data = writer.data;
7167 Py_ssize_t startinpos, endinpos;
7168
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007169 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007170 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007172 PyUnicode_WRITE(kind, data, writer.pos, c);
7173 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007175 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007177
7178 /* byte outsize range 0x00..0x7f: call the error handler */
7179
7180 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007181 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007182
7183 switch (error_handler)
7184 {
7185 case _Py_ERROR_REPLACE:
7186 case _Py_ERROR_SURROGATEESCAPE:
7187 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007188 but we may switch to UCS2 at the first write */
7189 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7190 goto onError;
7191 kind = writer.kind;
7192 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007193
7194 if (error_handler == _Py_ERROR_REPLACE)
7195 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7196 else
7197 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7198 writer.pos++;
7199 ++s;
7200 break;
7201
7202 case _Py_ERROR_IGNORE:
7203 ++s;
7204 break;
7205
7206 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 startinpos = s-starts;
7208 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007209 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007210 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 "ascii", "ordinal not in range(128)",
7212 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007213 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007215 kind = writer.kind;
7216 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007219 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007220 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007221 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007222
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007224 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007225 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007226 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 return NULL;
7228}
7229
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007230/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007231PyObject *
7232PyUnicode_EncodeASCII(const Py_UNICODE *p,
7233 Py_ssize_t size,
7234 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007236 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007237 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007238 if (unicode == NULL)
7239 return NULL;
7240 result = unicode_encode_ucs1(unicode, errors, 128);
7241 Py_DECREF(unicode);
7242 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243}
7244
Alexander Belopolsky40018472011-02-26 01:02:56 +00007245PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007246_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247{
7248 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007249 PyErr_BadArgument();
7250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007252 if (PyUnicode_READY(unicode) == -1)
7253 return NULL;
7254 /* Fast path: if it is an ASCII-only string, construct bytes object
7255 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007256 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007257 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7258 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007259 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007260}
7261
7262PyObject *
7263PyUnicode_AsASCIIString(PyObject *unicode)
7264{
7265 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266}
7267
Steve Dowercc16be82016-09-08 10:35:16 -07007268#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007269
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007270/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007271
/* NEED_RETRY is defined when an int is narrower than size_t. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for headers that do not provide the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7285
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007286static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007287code_page_name(UINT code_page, PyObject **obj)
7288{
7289 *obj = NULL;
7290 if (code_page == CP_ACP)
7291 return "mbcs";
7292 if (code_page == CP_UTF7)
7293 return "CP_UTF7";
7294 if (code_page == CP_UTF8)
7295 return "CP_UTF8";
7296
7297 *obj = PyBytes_FromFormat("cp%u", code_page);
7298 if (*obj == NULL)
7299 return NULL;
7300 return PyBytes_AS_STRING(*obj);
7301}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302
Victor Stinner3a50e702011-10-18 21:21:00 +02007303static DWORD
7304decode_code_page_flags(UINT code_page)
7305{
7306 if (code_page == CP_UTF7) {
7307 /* The CP_UTF7 decoder only supports flags=0 */
7308 return 0;
7309 }
7310 else
7311 return MB_ERR_INVALID_CHARS;
7312}
7313
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 * Decode a byte string from a Windows code page into unicode object in strict
7316 * mode.
7317 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007318 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7319 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007321static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007322decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007323 wchar_t **buf,
7324 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007325 const char *in,
7326 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007327{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007328 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007329 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007331
7332 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007334 while ((outsize = MultiByteToWideChar(code_page, flags,
7335 in, insize, NULL, 0)) <= 0)
7336 {
7337 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7338 goto error;
7339 }
7340 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7341 flags = 0;
7342 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007344 /* Extend a wchar_t* buffer */
7345 Py_ssize_t n = *bufsize; /* Get the current length */
7346 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7347 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007348 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007349 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007350
7351 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007352 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7353 if (outsize <= 0)
7354 goto error;
7355 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007356
Victor Stinner3a50e702011-10-18 21:21:00 +02007357error:
7358 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7359 return -2;
7360 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007361 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007362}
7363
Victor Stinner3a50e702011-10-18 21:21:00 +02007364/*
7365 * Decode a byte string from a code page into unicode object with an error
7366 * handler.
7367 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007368 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 * UnicodeDecodeError exception and returns -1 on error.
7370 */
7371static int
7372decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007373 wchar_t **buf,
7374 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007375 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007376 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007377{
7378 const char *startin = in;
7379 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007380 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 /* Ideally, we should get reason from FormatMessage. This is the Windows
7382 2000 English version of the message. */
7383 const char *reason = "No mapping for the Unicode character exists "
7384 "in the target code page.";
7385 /* each step cannot decode more than 1 character, but a character can be
7386 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007387 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007388 int insize;
7389 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 PyObject *errorHandler = NULL;
7391 PyObject *exc = NULL;
7392 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007393 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 DWORD err;
7395 int ret = -1;
7396
7397 assert(size > 0);
7398
7399 encoding = code_page_name(code_page, &encoding_obj);
7400 if (encoding == NULL)
7401 return -1;
7402
Victor Stinner7d00cc12014-03-17 23:08:06 +01007403 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7405 UnicodeDecodeError. */
7406 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7407 if (exc != NULL) {
7408 PyCodec_StrictErrors(exc);
7409 Py_CLEAR(exc);
7410 }
7411 goto error;
7412 }
7413
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007414 /* Extend a wchar_t* buffer */
7415 Py_ssize_t n = *bufsize; /* Get the current length */
7416 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7417 PyErr_NoMemory();
7418 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007420 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7421 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007423 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007424
7425 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 while (in < endin)
7427 {
7428 /* Decode a character */
7429 insize = 1;
7430 do
7431 {
7432 outsize = MultiByteToWideChar(code_page, flags,
7433 in, insize,
7434 buffer, Py_ARRAY_LENGTH(buffer));
7435 if (outsize > 0)
7436 break;
7437 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007438 if (err == ERROR_INVALID_FLAGS && flags) {
7439 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7440 flags = 0;
7441 continue;
7442 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 if (err != ERROR_NO_UNICODE_TRANSLATION
7444 && err != ERROR_INSUFFICIENT_BUFFER)
7445 {
7446 PyErr_SetFromWindowsErr(0);
7447 goto error;
7448 }
7449 insize++;
7450 }
7451 /* 4=maximum length of a UTF-8 sequence */
7452 while (insize <= 4 && (in + insize) <= endin);
7453
7454 if (outsize <= 0) {
7455 Py_ssize_t startinpos, endinpos, outpos;
7456
Victor Stinner7d00cc12014-03-17 23:08:06 +01007457 /* last character in partial decode? */
7458 if (in + insize >= endin && !final)
7459 break;
7460
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 startinpos = in - startin;
7462 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007463 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007464 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 errors, &errorHandler,
7466 encoding, reason,
7467 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007468 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 {
7470 goto error;
7471 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007472 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 }
7474 else {
7475 in += insize;
7476 memcpy(out, buffer, outsize * sizeof(wchar_t));
7477 out += outsize;
7478 }
7479 }
7480
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007481 /* Shrink the buffer */
7482 assert(out - *buf <= *bufsize);
7483 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007484 /* (in - startin) <= size and size is an int */
7485 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007486
7487error:
7488 Py_XDECREF(encoding_obj);
7489 Py_XDECREF(errorHandler);
7490 Py_XDECREF(exc);
7491 return ret;
7492}
7493
Victor Stinner3a50e702011-10-18 21:21:00 +02007494static PyObject *
7495decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007496 const char *s, Py_ssize_t size,
7497 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007498{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007499 wchar_t *buf = NULL;
7500 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007501 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007502
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 if (code_page < 0) {
7504 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7505 return NULL;
7506 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007507 if (size < 0) {
7508 PyErr_BadInternalCall();
7509 return NULL;
7510 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007511
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007512 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514
Victor Stinner76a31a62011-11-04 00:05:13 +01007515 do
7516 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007517#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007518 if (size > DECODING_CHUNK_SIZE) {
7519 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007520 final = 0;
7521 done = 0;
7522 }
7523 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007524#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007525 {
7526 chunk_size = (int)size;
7527 final = (consumed == NULL);
7528 done = 1;
7529 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007530
Victor Stinner76a31a62011-11-04 00:05:13 +01007531 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007532 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007533 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007534 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007535 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007536
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007537 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007538 s, chunk_size);
7539 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007540 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007541 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007542 errors, final);
7543 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007544
7545 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007546 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007547 return NULL;
7548 }
7549
7550 if (consumed)
7551 *consumed += converted;
7552
7553 s += converted;
7554 size -= converted;
7555 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007556
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007557 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7558 PyMem_Free(buf);
7559 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007560}
7561
Alexander Belopolsky40018472011-02-26 01:02:56 +00007562PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007563PyUnicode_DecodeCodePageStateful(int code_page,
7564 const char *s,
7565 Py_ssize_t size,
7566 const char *errors,
7567 Py_ssize_t *consumed)
7568{
7569 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7570}
7571
7572PyObject *
7573PyUnicode_DecodeMBCSStateful(const char *s,
7574 Py_ssize_t size,
7575 const char *errors,
7576 Py_ssize_t *consumed)
7577{
7578 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7579}
7580
7581PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007582PyUnicode_DecodeMBCS(const char *s,
7583 Py_ssize_t size,
7584 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007585{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007586 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7587}
7588
Victor Stinner3a50e702011-10-18 21:21:00 +02007589static DWORD
7590encode_code_page_flags(UINT code_page, const char *errors)
7591{
7592 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007593 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 }
7595 else if (code_page == CP_UTF7) {
7596 /* CP_UTF7 only supports flags=0 */
7597 return 0;
7598 }
7599 else {
7600 if (errors != NULL && strcmp(errors, "replace") == 0)
7601 return 0;
7602 else
7603 return WC_NO_BEST_FIT_CHARS;
7604 }
7605}
7606
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007607/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007608 * Encode a Unicode string to a Windows code page into a byte string in strict
7609 * mode.
7610 *
7611 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007612 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007613 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007614static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007615encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007616 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007618{
Victor Stinner554f3f02010-06-16 23:33:54 +00007619 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 BOOL *pusedDefaultChar = &usedDefaultChar;
7621 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007622 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007623 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 const DWORD flags = encode_code_page_flags(code_page, NULL);
7625 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007626 /* Create a substring so that we can get the UTF-16 representation
7627 of just the slice under consideration. */
7628 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007629
Martin v. Löwis3d325192011-11-04 18:23:06 +01007630 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007631
Victor Stinner3a50e702011-10-18 21:21:00 +02007632 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007633 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007635 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007636
Victor Stinner2fc507f2011-11-04 20:06:39 +01007637 substring = PyUnicode_Substring(unicode, offset, offset+len);
7638 if (substring == NULL)
7639 return -1;
7640 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7641 if (p == NULL) {
7642 Py_DECREF(substring);
7643 return -1;
7644 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007645 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007647 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007648 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007649 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 NULL, 0,
7651 NULL, pusedDefaultChar);
7652 if (outsize <= 0)
7653 goto error;
7654 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007655 if (pusedDefaultChar && *pusedDefaultChar) {
7656 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007657 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007658 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007659
Victor Stinner3a50e702011-10-18 21:21:00 +02007660 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007662 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007663 if (*outbytes == NULL) {
7664 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007666 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007668 }
7669 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007671 const Py_ssize_t n = PyBytes_Size(*outbytes);
7672 if (outsize > PY_SSIZE_T_MAX - n) {
7673 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007674 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007676 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007677 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7678 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007679 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007680 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007681 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007682 }
7683
7684 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007685 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007686 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007687 out, outsize,
7688 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007689 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007690 if (outsize <= 0)
7691 goto error;
7692 if (pusedDefaultChar && *pusedDefaultChar)
7693 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007694 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007695
Victor Stinner3a50e702011-10-18 21:21:00 +02007696error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007697 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7699 return -2;
7700 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007701 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007702}
7703
Victor Stinner3a50e702011-10-18 21:21:00 +02007704/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007705 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007706 * error handler.
7707 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007708 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007709 * -1 on other error.
7710 */
7711static int
7712encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007713 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007714 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007715{
Victor Stinner3a50e702011-10-18 21:21:00 +02007716 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007717 Py_ssize_t pos = unicode_offset;
7718 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007719 /* Ideally, we should get reason from FormatMessage. This is the Windows
7720 2000 English version of the message. */
7721 const char *reason = "invalid character";
7722 /* 4=maximum length of a UTF-8 sequence */
7723 char buffer[4];
7724 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7725 Py_ssize_t outsize;
7726 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007727 PyObject *errorHandler = NULL;
7728 PyObject *exc = NULL;
7729 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007730 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007731 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007732 PyObject *rep;
7733 int ret = -1;
7734
7735 assert(insize > 0);
7736
7737 encoding = code_page_name(code_page, &encoding_obj);
7738 if (encoding == NULL)
7739 return -1;
7740
7741 if (errors == NULL || strcmp(errors, "strict") == 0) {
7742 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7743 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007744 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007745 if (exc != NULL) {
7746 PyCodec_StrictErrors(exc);
7747 Py_DECREF(exc);
7748 }
7749 Py_XDECREF(encoding_obj);
7750 return -1;
7751 }
7752
7753 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7754 pusedDefaultChar = &usedDefaultChar;
7755 else
7756 pusedDefaultChar = NULL;
7757
7758 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7759 PyErr_NoMemory();
7760 goto error;
7761 }
7762 outsize = insize * Py_ARRAY_LENGTH(buffer);
7763
7764 if (*outbytes == NULL) {
7765 /* Create string object */
7766 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7767 if (*outbytes == NULL)
7768 goto error;
7769 out = PyBytes_AS_STRING(*outbytes);
7770 }
7771 else {
7772 /* Extend string object */
7773 Py_ssize_t n = PyBytes_Size(*outbytes);
7774 if (n > PY_SSIZE_T_MAX - outsize) {
7775 PyErr_NoMemory();
7776 goto error;
7777 }
7778 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7779 goto error;
7780 out = PyBytes_AS_STRING(*outbytes) + n;
7781 }
7782
7783 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007785 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007786 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7787 wchar_t chars[2];
7788 int charsize;
7789 if (ch < 0x10000) {
7790 chars[0] = (wchar_t)ch;
7791 charsize = 1;
7792 }
7793 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007794 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7795 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007796 charsize = 2;
7797 }
7798
Victor Stinner3a50e702011-10-18 21:21:00 +02007799 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007800 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007801 buffer, Py_ARRAY_LENGTH(buffer),
7802 NULL, pusedDefaultChar);
7803 if (outsize > 0) {
7804 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7805 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007806 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007807 memcpy(out, buffer, outsize);
7808 out += outsize;
7809 continue;
7810 }
7811 }
7812 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7813 PyErr_SetFromWindowsErr(0);
7814 goto error;
7815 }
7816
Victor Stinner3a50e702011-10-18 21:21:00 +02007817 rep = unicode_encode_call_errorhandler(
7818 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007819 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007820 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007821 if (rep == NULL)
7822 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007823 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007824
7825 if (PyBytes_Check(rep)) {
7826 outsize = PyBytes_GET_SIZE(rep);
7827 if (outsize != 1) {
7828 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7829 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7830 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7831 Py_DECREF(rep);
7832 goto error;
7833 }
7834 out = PyBytes_AS_STRING(*outbytes) + offset;
7835 }
7836 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7837 out += outsize;
7838 }
7839 else {
7840 Py_ssize_t i;
7841 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007842 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007843
Benjamin Petersonbac79492012-01-14 13:34:47 -05007844 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007845 Py_DECREF(rep);
7846 goto error;
7847 }
7848
7849 outsize = PyUnicode_GET_LENGTH(rep);
7850 if (outsize != 1) {
7851 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7852 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7853 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7854 Py_DECREF(rep);
7855 goto error;
7856 }
7857 out = PyBytes_AS_STRING(*outbytes) + offset;
7858 }
7859 kind = PyUnicode_KIND(rep);
7860 data = PyUnicode_DATA(rep);
7861 for (i=0; i < outsize; i++) {
7862 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7863 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007864 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007865 encoding, unicode,
7866 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007867 "unable to encode error handler result to ASCII");
7868 Py_DECREF(rep);
7869 goto error;
7870 }
7871 *out = (unsigned char)ch;
7872 out++;
7873 }
7874 }
7875 Py_DECREF(rep);
7876 }
7877 /* write a NUL byte */
7878 *out = 0;
7879 outsize = out - PyBytes_AS_STRING(*outbytes);
7880 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7881 if (_PyBytes_Resize(outbytes, outsize) < 0)
7882 goto error;
7883 ret = 0;
7884
7885error:
7886 Py_XDECREF(encoding_obj);
7887 Py_XDECREF(errorHandler);
7888 Py_XDECREF(exc);
7889 return ret;
7890}
7891
Victor Stinner3a50e702011-10-18 21:21:00 +02007892static PyObject *
7893encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007894 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007895 const char *errors)
7896{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007897 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007898 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007899 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007900 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007901
Victor Stinner29dacf22015-01-26 16:41:32 +01007902 if (!PyUnicode_Check(unicode)) {
7903 PyErr_BadArgument();
7904 return NULL;
7905 }
7906
Benjamin Petersonbac79492012-01-14 13:34:47 -05007907 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007908 return NULL;
7909 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007910
Victor Stinner3a50e702011-10-18 21:21:00 +02007911 if (code_page < 0) {
7912 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7913 return NULL;
7914 }
7915
Martin v. Löwis3d325192011-11-04 18:23:06 +01007916 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007917 return PyBytes_FromStringAndSize(NULL, 0);
7918
Victor Stinner7581cef2011-11-03 22:32:33 +01007919 offset = 0;
7920 do
7921 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007922#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007923 if (len > DECODING_CHUNK_SIZE) {
7924 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007925 done = 0;
7926 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007927 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007928#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007929 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007930 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007931 done = 1;
7932 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007933
Victor Stinner76a31a62011-11-04 00:05:13 +01007934 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007935 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007936 errors);
7937 if (ret == -2)
7938 ret = encode_code_page_errors(code_page, &outbytes,
7939 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007940 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007941 if (ret < 0) {
7942 Py_XDECREF(outbytes);
7943 return NULL;
7944 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007945
Victor Stinner7581cef2011-11-03 22:32:33 +01007946 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007947 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007948 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007949
Victor Stinner3a50e702011-10-18 21:21:00 +02007950 return outbytes;
7951}
7952
7953PyObject *
7954PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7955 Py_ssize_t size,
7956 const char *errors)
7957{
Victor Stinner7581cef2011-11-03 22:32:33 +01007958 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007959 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007960 if (unicode == NULL)
7961 return NULL;
7962 res = encode_code_page(CP_ACP, unicode, errors);
7963 Py_DECREF(unicode);
7964 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007965}
7966
7967PyObject *
7968PyUnicode_EncodeCodePage(int code_page,
7969 PyObject *unicode,
7970 const char *errors)
7971{
Victor Stinner7581cef2011-11-03 22:32:33 +01007972 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007973}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007974
Alexander Belopolsky40018472011-02-26 01:02:56 +00007975PyObject *
7976PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007977{
Victor Stinner7581cef2011-11-03 22:32:33 +01007978 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007979}
7980
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007981#undef NEED_RETRY
7982
Steve Dowercc16be82016-09-08 10:35:16 -07007983#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007984
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985/* --- Character Mapping Codec -------------------------------------------- */
7986
Victor Stinnerfb161b12013-04-18 01:44:27 +02007987static int
7988charmap_decode_string(const char *s,
7989 Py_ssize_t size,
7990 PyObject *mapping,
7991 const char *errors,
7992 _PyUnicodeWriter *writer)
7993{
7994 const char *starts = s;
7995 const char *e;
7996 Py_ssize_t startinpos, endinpos;
7997 PyObject *errorHandler = NULL, *exc = NULL;
7998 Py_ssize_t maplen;
7999 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008000 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008001 Py_UCS4 x;
8002 unsigned char ch;
8003
8004 if (PyUnicode_READY(mapping) == -1)
8005 return -1;
8006
8007 maplen = PyUnicode_GET_LENGTH(mapping);
8008 mapdata = PyUnicode_DATA(mapping);
8009 mapkind = PyUnicode_KIND(mapping);
8010
8011 e = s + size;
8012
8013 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8014 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8015 * is disabled in encoding aliases, latin1 is preferred because
8016 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008017 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008018 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8019 Py_UCS4 maxchar = writer->maxchar;
8020
8021 assert (writer->kind == PyUnicode_1BYTE_KIND);
8022 while (s < e) {
8023 ch = *s;
8024 x = mapdata_ucs1[ch];
8025 if (x > maxchar) {
8026 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8027 goto onError;
8028 maxchar = writer->maxchar;
8029 outdata = (Py_UCS1 *)writer->data;
8030 }
8031 outdata[writer->pos] = x;
8032 writer->pos++;
8033 ++s;
8034 }
8035 return 0;
8036 }
8037
8038 while (s < e) {
8039 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8040 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008041 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008042 if (outkind == PyUnicode_1BYTE_KIND) {
8043 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8044 Py_UCS4 maxchar = writer->maxchar;
8045 while (s < e) {
8046 ch = *s;
8047 x = mapdata_ucs2[ch];
8048 if (x > maxchar)
8049 goto Error;
8050 outdata[writer->pos] = x;
8051 writer->pos++;
8052 ++s;
8053 }
8054 break;
8055 }
8056 else if (outkind == PyUnicode_2BYTE_KIND) {
8057 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8058 while (s < e) {
8059 ch = *s;
8060 x = mapdata_ucs2[ch];
8061 if (x == 0xFFFE)
8062 goto Error;
8063 outdata[writer->pos] = x;
8064 writer->pos++;
8065 ++s;
8066 }
8067 break;
8068 }
8069 }
8070 ch = *s;
8071
8072 if (ch < maplen)
8073 x = PyUnicode_READ(mapkind, mapdata, ch);
8074 else
8075 x = 0xfffe; /* invalid value */
8076Error:
8077 if (x == 0xfffe)
8078 {
8079 /* undefined mapping */
8080 startinpos = s-starts;
8081 endinpos = startinpos+1;
8082 if (unicode_decode_call_errorhandler_writer(
8083 errors, &errorHandler,
8084 "charmap", "character maps to <undefined>",
8085 &starts, &e, &startinpos, &endinpos, &exc, &s,
8086 writer)) {
8087 goto onError;
8088 }
8089 continue;
8090 }
8091
8092 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8093 goto onError;
8094 ++s;
8095 }
8096 Py_XDECREF(errorHandler);
8097 Py_XDECREF(exc);
8098 return 0;
8099
8100onError:
8101 Py_XDECREF(errorHandler);
8102 Py_XDECREF(exc);
8103 return -1;
8104}
8105
8106static int
8107charmap_decode_mapping(const char *s,
8108 Py_ssize_t size,
8109 PyObject *mapping,
8110 const char *errors,
8111 _PyUnicodeWriter *writer)
8112{
8113 const char *starts = s;
8114 const char *e;
8115 Py_ssize_t startinpos, endinpos;
8116 PyObject *errorHandler = NULL, *exc = NULL;
8117 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008118 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008119
8120 e = s + size;
8121
8122 while (s < e) {
8123 ch = *s;
8124
8125 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8126 key = PyLong_FromLong((long)ch);
8127 if (key == NULL)
8128 goto onError;
8129
8130 item = PyObject_GetItem(mapping, key);
8131 Py_DECREF(key);
8132 if (item == NULL) {
8133 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8134 /* No mapping found means: mapping is undefined. */
8135 PyErr_Clear();
8136 goto Undefined;
8137 } else
8138 goto onError;
8139 }
8140
8141 /* Apply mapping */
8142 if (item == Py_None)
8143 goto Undefined;
8144 if (PyLong_Check(item)) {
8145 long value = PyLong_AS_LONG(item);
8146 if (value == 0xFFFE)
8147 goto Undefined;
8148 if (value < 0 || value > MAX_UNICODE) {
8149 PyErr_Format(PyExc_TypeError,
8150 "character mapping must be in range(0x%lx)",
8151 (unsigned long)MAX_UNICODE + 1);
8152 goto onError;
8153 }
8154
8155 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8156 goto onError;
8157 }
8158 else if (PyUnicode_Check(item)) {
8159 if (PyUnicode_READY(item) == -1)
8160 goto onError;
8161 if (PyUnicode_GET_LENGTH(item) == 1) {
8162 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8163 if (value == 0xFFFE)
8164 goto Undefined;
8165 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8166 goto onError;
8167 }
8168 else {
8169 writer->overallocate = 1;
8170 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8171 goto onError;
8172 }
8173 }
8174 else {
8175 /* wrong return value */
8176 PyErr_SetString(PyExc_TypeError,
8177 "character mapping must return integer, None or str");
8178 goto onError;
8179 }
8180 Py_CLEAR(item);
8181 ++s;
8182 continue;
8183
8184Undefined:
8185 /* undefined mapping */
8186 Py_CLEAR(item);
8187 startinpos = s-starts;
8188 endinpos = startinpos+1;
8189 if (unicode_decode_call_errorhandler_writer(
8190 errors, &errorHandler,
8191 "charmap", "character maps to <undefined>",
8192 &starts, &e, &startinpos, &endinpos, &exc, &s,
8193 writer)) {
8194 goto onError;
8195 }
8196 }
8197 Py_XDECREF(errorHandler);
8198 Py_XDECREF(exc);
8199 return 0;
8200
8201onError:
8202 Py_XDECREF(item);
8203 Py_XDECREF(errorHandler);
8204 Py_XDECREF(exc);
8205 return -1;
8206}
8207
Alexander Belopolsky40018472011-02-26 01:02:56 +00008208PyObject *
8209PyUnicode_DecodeCharmap(const char *s,
8210 Py_ssize_t size,
8211 PyObject *mapping,
8212 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008214 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008215
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 /* Default to Latin-1 */
8217 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008221 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008222 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008223 writer.min_length = size;
8224 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008226
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008227 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008228 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8229 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008230 }
8231 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008232 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8233 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008235 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008236
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008238 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 return NULL;
8240}
8241
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008242/* Charmap encoding: the lookup table */
8243
Alexander Belopolsky40018472011-02-26 01:02:56 +00008244struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 PyObject_HEAD
8246 unsigned char level1[32];
8247 int count2, count3;
8248 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008249};
8250
8251static PyObject*
8252encoding_map_size(PyObject *obj, PyObject* args)
8253{
8254 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008255 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008257}
8258
8259static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008260 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 PyDoc_STR("Return the size (in bytes) of this object") },
8262 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008263};
8264
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008265static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008266 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 "EncodingMap", /*tp_name*/
8268 sizeof(struct encoding_map), /*tp_basicsize*/
8269 0, /*tp_itemsize*/
8270 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008271 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008272 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 0, /*tp_getattr*/
8274 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008275 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 0, /*tp_repr*/
8277 0, /*tp_as_number*/
8278 0, /*tp_as_sequence*/
8279 0, /*tp_as_mapping*/
8280 0, /*tp_hash*/
8281 0, /*tp_call*/
8282 0, /*tp_str*/
8283 0, /*tp_getattro*/
8284 0, /*tp_setattro*/
8285 0, /*tp_as_buffer*/
8286 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8287 0, /*tp_doc*/
8288 0, /*tp_traverse*/
8289 0, /*tp_clear*/
8290 0, /*tp_richcompare*/
8291 0, /*tp_weaklistoffset*/
8292 0, /*tp_iter*/
8293 0, /*tp_iternext*/
8294 encoding_map_methods, /*tp_methods*/
8295 0, /*tp_members*/
8296 0, /*tp_getset*/
8297 0, /*tp_base*/
8298 0, /*tp_dict*/
8299 0, /*tp_descr_get*/
8300 0, /*tp_descr_set*/
8301 0, /*tp_dictoffset*/
8302 0, /*tp_init*/
8303 0, /*tp_alloc*/
8304 0, /*tp_new*/
8305 0, /*tp_free*/
8306 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307};
8308
8309PyObject*
8310PyUnicode_BuildEncodingMap(PyObject* string)
8311{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 PyObject *result;
8313 struct encoding_map *mresult;
8314 int i;
8315 int need_dict = 0;
8316 unsigned char level1[32];
8317 unsigned char level2[512];
8318 unsigned char *mlevel1, *mlevel2, *mlevel3;
8319 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008320 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008321 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008322 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008323 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008324
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008325 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008326 PyErr_BadArgument();
8327 return NULL;
8328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 kind = PyUnicode_KIND(string);
8330 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008331 length = PyUnicode_GET_LENGTH(string);
8332 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 memset(level1, 0xFF, sizeof level1);
8334 memset(level2, 0xFF, sizeof level2);
8335
8336 /* If there isn't a one-to-one mapping of NULL to \0,
8337 or if there are non-BMP characters, we need to use
8338 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008341 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008343 ch = PyUnicode_READ(kind, data, i);
8344 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345 need_dict = 1;
8346 break;
8347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008349 /* unmapped character */
8350 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 l1 = ch >> 11;
8352 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353 if (level1[l1] == 0xFF)
8354 level1[l1] = count2++;
8355 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008356 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008357 }
8358
8359 if (count2 >= 0xFF || count3 >= 0xFF)
8360 need_dict = 1;
8361
8362 if (need_dict) {
8363 PyObject *result = PyDict_New();
8364 PyObject *key, *value;
8365 if (!result)
8366 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008367 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008369 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370 if (!key || !value)
8371 goto failed1;
8372 if (PyDict_SetItem(result, key, value) == -1)
8373 goto failed1;
8374 Py_DECREF(key);
8375 Py_DECREF(value);
8376 }
8377 return result;
8378 failed1:
8379 Py_XDECREF(key);
8380 Py_XDECREF(value);
8381 Py_DECREF(result);
8382 return NULL;
8383 }
8384
8385 /* Create a three-level trie */
8386 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8387 16*count2 + 128*count3 - 1);
8388 if (!result)
8389 return PyErr_NoMemory();
8390 PyObject_Init(result, &EncodingMapType);
8391 mresult = (struct encoding_map*)result;
8392 mresult->count2 = count2;
8393 mresult->count3 = count3;
8394 mlevel1 = mresult->level1;
8395 mlevel2 = mresult->level23;
8396 mlevel3 = mresult->level23 + 16*count2;
8397 memcpy(mlevel1, level1, 32);
8398 memset(mlevel2, 0xFF, 16*count2);
8399 memset(mlevel3, 0, 128*count3);
8400 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008401 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008402 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008403 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8404 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008405 /* unmapped character */
8406 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008407 o1 = ch>>11;
8408 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008409 i2 = 16*mlevel1[o1] + o2;
8410 if (mlevel2[i2] == 0xFF)
8411 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008412 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008413 i3 = 128*mlevel2[i2] + o3;
8414 mlevel3[i3] = i;
8415 }
8416 return result;
8417}
8418
8419static int
Victor Stinner22168992011-11-20 17:09:18 +01008420encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008421{
8422 struct encoding_map *map = (struct encoding_map*)mapping;
8423 int l1 = c>>11;
8424 int l2 = (c>>7) & 0xF;
8425 int l3 = c & 0x7F;
8426 int i;
8427
Victor Stinner22168992011-11-20 17:09:18 +01008428 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008430 if (c == 0)
8431 return 0;
8432 /* level 1*/
8433 i = map->level1[l1];
8434 if (i == 0xFF) {
8435 return -1;
8436 }
8437 /* level 2*/
8438 i = map->level23[16*i+l2];
8439 if (i == 0xFF) {
8440 return -1;
8441 }
8442 /* level 3 */
8443 i = map->level23[16*map->count2 + 128*i + l3];
8444 if (i == 0) {
8445 return -1;
8446 }
8447 return i;
8448}
8449
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008450/* Lookup the character ch in the mapping. If the character
8451 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008452 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008453static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008454charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455{
Christian Heimes217cfd12007-12-02 14:31:20 +00008456 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457 PyObject *x;
8458
8459 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 x = PyObject_GetItem(mapping, w);
8462 Py_DECREF(w);
8463 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8465 /* No mapping found means: mapping is undefined. */
8466 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008467 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 } else
8469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008471 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008473 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 long value = PyLong_AS_LONG(x);
8475 if (value < 0 || value > 255) {
8476 PyErr_SetString(PyExc_TypeError,
8477 "character mapping must be in range(256)");
8478 Py_DECREF(x);
8479 return NULL;
8480 }
8481 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008483 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 /* wrong return value */
8487 PyErr_Format(PyExc_TypeError,
8488 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008489 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 Py_DECREF(x);
8491 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 }
8493}
8494
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008495static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008496charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008497{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008498 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8499 /* exponentially overallocate to minimize reallocations */
8500 if (requiredsize < 2*outsize)
8501 requiredsize = 2*outsize;
8502 if (_PyBytes_Resize(outobj, requiredsize))
8503 return -1;
8504 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008505}
8506
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008511 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512 space is available. Return a new reference to the object that
8513 was put in the output buffer, or Py_None, if the mapping was undefined
8514 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008515 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008516static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008517charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008518 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008520 PyObject *rep;
8521 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008522 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523
Andy Lesterdffe4c02020-03-04 07:15:20 -06008524 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008525 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008527 if (res == -1)
8528 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 if (outsize<requiredsize)
8530 if (charmapencode_resize(outobj, outpos, requiredsize))
8531 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008532 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 outstart[(*outpos)++] = (char)res;
8534 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008535 }
8536
8537 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008540 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 Py_DECREF(rep);
8542 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008543 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 if (PyLong_Check(rep)) {
8545 Py_ssize_t requiredsize = *outpos+1;
8546 if (outsize<requiredsize)
8547 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8548 Py_DECREF(rep);
8549 return enc_EXCEPTION;
8550 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008551 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008553 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 else {
8555 const char *repchars = PyBytes_AS_STRING(rep);
8556 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8557 Py_ssize_t requiredsize = *outpos+repsize;
8558 if (outsize<requiredsize)
8559 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8560 Py_DECREF(rep);
8561 return enc_EXCEPTION;
8562 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008563 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 memcpy(outstart + *outpos, repchars, repsize);
8565 *outpos += repsize;
8566 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008568 Py_DECREF(rep);
8569 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570}
8571
8572/* handle an error in PyUnicode_EncodeCharmap
8573 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008574static int
8575charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008576 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008578 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008579 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580{
8581 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008582 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008583 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008584 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008585 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008586 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008588 Py_ssize_t collstartpos = *inpos;
8589 Py_ssize_t collendpos = *inpos+1;
8590 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008591 const char *encoding = "charmap";
8592 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008593 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008594 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008595 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596
Benjamin Petersonbac79492012-01-14 13:34:47 -05008597 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008598 return -1;
8599 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600 /* find all unencodable characters */
8601 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008602 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008603 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008604 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008605 val = encoding_map_lookup(ch, mapping);
8606 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 break;
8608 ++collendpos;
8609 continue;
8610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008611
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8613 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 if (rep==NULL)
8615 return -1;
8616 else if (rep!=Py_None) {
8617 Py_DECREF(rep);
8618 break;
8619 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008620 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008622 }
8623 /* cache callback name lookup
8624 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008625 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008626 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008627
8628 switch (*error_handler) {
8629 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008630 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008631 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008632
8633 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008634 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 x = charmapencode_output('?', mapping, res, respos);
8636 if (x==enc_EXCEPTION) {
8637 return -1;
8638 }
8639 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008640 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 return -1;
8642 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008643 }
8644 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008645 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008646 *inpos = collendpos;
8647 break;
Victor Stinner50149202015-09-22 00:26:54 +02008648
8649 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008650 /* generate replacement (temporarily (mis)uses p) */
8651 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 char buffer[2+29+1+1];
8653 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008654 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 for (cp = buffer; *cp; ++cp) {
8656 x = charmapencode_output(*cp, mapping, res, respos);
8657 if (x==enc_EXCEPTION)
8658 return -1;
8659 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008660 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 return -1;
8662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008663 }
8664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008665 *inpos = collendpos;
8666 break;
Victor Stinner50149202015-09-22 00:26:54 +02008667
Benjamin Peterson14339b62009-01-31 16:36:08 +00008668 default:
Victor Stinner50149202015-09-22 00:26:54 +02008669 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008670 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008672 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008674 if (PyBytes_Check(repunicode)) {
8675 /* Directly copy bytes result to output. */
8676 Py_ssize_t outsize = PyBytes_Size(*res);
8677 Py_ssize_t requiredsize;
8678 repsize = PyBytes_Size(repunicode);
8679 requiredsize = *respos + repsize;
8680 if (requiredsize > outsize)
8681 /* Make room for all additional bytes. */
8682 if (charmapencode_resize(res, respos, requiredsize)) {
8683 Py_DECREF(repunicode);
8684 return -1;
8685 }
8686 memcpy(PyBytes_AsString(*res) + *respos,
8687 PyBytes_AsString(repunicode), repsize);
8688 *respos += repsize;
8689 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008690 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008691 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008692 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008693 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008694 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008695 Py_DECREF(repunicode);
8696 return -1;
8697 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008698 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008699 data = PyUnicode_DATA(repunicode);
8700 kind = PyUnicode_KIND(repunicode);
8701 for (index = 0; index < repsize; index++) {
8702 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8703 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008705 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 return -1;
8707 }
8708 else if (x==enc_FAILED) {
8709 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008710 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return -1;
8712 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008713 }
8714 *inpos = newpos;
8715 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 }
8717 return 0;
8718}
8719
Alexander Belopolsky40018472011-02-26 01:02:56 +00008720PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008721_PyUnicode_EncodeCharmap(PyObject *unicode,
8722 PyObject *mapping,
8723 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008725 /* output object */
8726 PyObject *res = NULL;
8727 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008728 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008729 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008731 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008732 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008734 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008735 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008736 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737
Benjamin Petersonbac79492012-01-14 13:34:47 -05008738 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008739 return NULL;
8740 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008741 data = PyUnicode_DATA(unicode);
8742 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008743
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 /* Default to Latin-1 */
8745 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008746 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008748 /* allocate enough for a simple encoding without
8749 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008750 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008751 if (res == NULL)
8752 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008753 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008757 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008759 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 if (x==enc_EXCEPTION) /* error */
8761 goto onError;
8762 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008763 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008765 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 &res, &respos)) {
8767 goto onError;
8768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008769 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 else
8771 /* done with this character => adjust input position */
8772 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008776 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008777 if (_PyBytes_Resize(&res, respos) < 0)
8778 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008779
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008780 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008781 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008782 return res;
8783
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785 Py_XDECREF(res);
8786 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008787 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 return NULL;
8789}
8790
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008791/* Deprecated */
8792PyObject *
8793PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8794 Py_ssize_t size,
8795 PyObject *mapping,
8796 const char *errors)
8797{
8798 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008799 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008800 if (unicode == NULL)
8801 return NULL;
8802 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8803 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008804 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008805}
8806
Alexander Belopolsky40018472011-02-26 01:02:56 +00008807PyObject *
8808PyUnicode_AsCharmapString(PyObject *unicode,
8809 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810{
8811 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 PyErr_BadArgument();
8813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008815 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816}
8817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008819static void
8820make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008822 Py_ssize_t startpos, Py_ssize_t endpos,
8823 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826 *exceptionObject = _PyUnicodeTranslateError_Create(
8827 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828 }
8829 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8831 goto onError;
8832 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8833 goto onError;
8834 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8835 goto onError;
8836 return;
8837 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008838 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 }
8840}
8841
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008842/* error handling callback helper:
8843 build arguments, call the callback and check the arguments,
8844 put the result into newpos and return the replacement string, which
8845 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008846static PyObject *
8847unicode_translate_call_errorhandler(const char *errors,
8848 PyObject **errorHandler,
8849 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008851 Py_ssize_t startpos, Py_ssize_t endpos,
8852 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008853{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008854 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008855
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008856 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008857 PyObject *restuple;
8858 PyObject *resunicode;
8859
8860 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008862 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008864 }
8865
8866 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008868 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870
Petr Viktorinffd97532020-02-11 17:46:57 +01008871 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008872 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008874 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008875 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 Py_DECREF(restuple);
8877 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008878 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008879 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 &resunicode, &i_newpos)) {
8881 Py_DECREF(restuple);
8882 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008883 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008884 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008886 else
8887 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008889 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 Py_DECREF(restuple);
8891 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008892 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008893 Py_INCREF(resunicode);
8894 Py_DECREF(restuple);
8895 return resunicode;
8896}
8897
8898/* Lookup the character ch in the mapping and put the result in result,
8899 which must be decrefed by the caller.
8900 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008901static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008903{
Christian Heimes217cfd12007-12-02 14:31:20 +00008904 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008905 PyObject *x;
8906
8907 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008909 x = PyObject_GetItem(mapping, w);
8910 Py_DECREF(w);
8911 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8913 /* No mapping found means: use 1:1 mapping. */
8914 PyErr_Clear();
8915 *result = NULL;
8916 return 0;
8917 } else
8918 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008919 }
8920 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 *result = x;
8922 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008923 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008924 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008926 if (value < 0 || value > MAX_UNICODE) {
8927 PyErr_Format(PyExc_ValueError,
8928 "character mapping must be in range(0x%x)",
8929 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 Py_DECREF(x);
8931 return -1;
8932 }
8933 *result = x;
8934 return 0;
8935 }
8936 else if (PyUnicode_Check(x)) {
8937 *result = x;
8938 return 0;
8939 }
8940 else {
8941 /* wrong return value */
8942 PyErr_SetString(PyExc_TypeError,
8943 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008944 Py_DECREF(x);
8945 return -1;
8946 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008947}
Victor Stinner1194ea02014-04-04 19:37:40 +02008948
8949/* lookup the character, write the result into the writer.
8950 Return 1 if the result was written into the writer, return 0 if the mapping
8951 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008952static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008953charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8954 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008955{
Victor Stinner1194ea02014-04-04 19:37:40 +02008956 PyObject *item;
8957
8958 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008960
8961 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008963 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008966 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008967 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008968
8969 if (item == Py_None) {
8970 Py_DECREF(item);
8971 return 0;
8972 }
8973
8974 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008975 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8976 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8977 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008978 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8979 Py_DECREF(item);
8980 return -1;
8981 }
8982 Py_DECREF(item);
8983 return 1;
8984 }
8985
8986 if (!PyUnicode_Check(item)) {
8987 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008989 }
8990
8991 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8992 Py_DECREF(item);
8993 return -1;
8994 }
8995
8996 Py_DECREF(item);
8997 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008998}
8999
Victor Stinner89a76ab2014-04-05 11:44:04 +02009000static int
9001unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9002 Py_UCS1 *translate)
9003{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009004 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009005 int ret = 0;
9006
Victor Stinner89a76ab2014-04-05 11:44:04 +02009007 if (charmaptranslate_lookup(ch, mapping, &item)) {
9008 return -1;
9009 }
9010
9011 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009012 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009013 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009014 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009015 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009016 /* not found => default to 1:1 mapping */
9017 translate[ch] = ch;
9018 return 1;
9019 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009020 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009021 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009022 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9023 used it */
9024 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009025 /* invalid character or character outside ASCII:
9026 skip the fast translate */
9027 goto exit;
9028 }
9029 translate[ch] = (Py_UCS1)replace;
9030 }
9031 else if (PyUnicode_Check(item)) {
9032 Py_UCS4 replace;
9033
9034 if (PyUnicode_READY(item) == -1) {
9035 Py_DECREF(item);
9036 return -1;
9037 }
9038 if (PyUnicode_GET_LENGTH(item) != 1)
9039 goto exit;
9040
9041 replace = PyUnicode_READ_CHAR(item, 0);
9042 if (replace > 127)
9043 goto exit;
9044 translate[ch] = (Py_UCS1)replace;
9045 }
9046 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009047 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009048 goto exit;
9049 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009050 ret = 1;
9051
Benjamin Peterson1365de72014-04-07 20:15:41 -04009052 exit:
9053 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009054 return ret;
9055}
9056
9057/* Fast path for ascii => ascii translation. Return 1 if the whole string
9058 was translated into writer, return 0 if the input string was partially
9059 translated into writer, raise an exception and return -1 on error. */
9060static int
9061unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009062 _PyUnicodeWriter *writer, int ignore,
9063 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009064{
Victor Stinner872b2912014-04-05 14:27:07 +02009065 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009066 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009067 const Py_UCS1 *in, *end;
9068 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009069 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009070
Victor Stinner89a76ab2014-04-05 11:44:04 +02009071 len = PyUnicode_GET_LENGTH(input);
9072
Victor Stinner872b2912014-04-05 14:27:07 +02009073 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009074
9075 in = PyUnicode_1BYTE_DATA(input);
9076 end = in + len;
9077
9078 assert(PyUnicode_IS_ASCII(writer->buffer));
9079 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9080 out = PyUnicode_1BYTE_DATA(writer->buffer);
9081
Victor Stinner872b2912014-04-05 14:27:07 +02009082 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009083 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009084 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009085 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009086 int translate = unicode_fast_translate_lookup(mapping, ch,
9087 ascii_table);
9088 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009089 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009090 if (translate == 0)
9091 goto exit;
9092 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009093 }
Victor Stinner872b2912014-04-05 14:27:07 +02009094 if (ch2 == 0xfe) {
9095 if (ignore)
9096 continue;
9097 goto exit;
9098 }
9099 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009100 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009101 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009102 }
Victor Stinner872b2912014-04-05 14:27:07 +02009103 res = 1;
9104
9105exit:
9106 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009107 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009108 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009109}
9110
Victor Stinner3222da22015-10-01 22:07:32 +02009111static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112_PyUnicode_TranslateCharmap(PyObject *input,
9113 PyObject *mapping,
9114 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009117 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 Py_ssize_t size, i;
9119 int kind;
9120 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009121 _PyUnicodeWriter writer;
9122 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009123 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009124 PyObject *errorHandler = NULL;
9125 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009126 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009127 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009128
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 PyErr_BadArgument();
9131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 if (PyUnicode_READY(input) == -1)
9135 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009136 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137 kind = PyUnicode_KIND(input);
9138 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009140 if (size == 0)
9141 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009143 /* allocate enough for a simple 1:1 translation without
9144 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009145 _PyUnicodeWriter_Init(&writer);
9146 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148
Victor Stinner872b2912014-04-05 14:27:07 +02009149 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9150
Victor Stinner33798672016-03-01 21:59:58 +01009151 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009152 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009153 if (PyUnicode_IS_ASCII(input)) {
9154 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9155 if (res < 0) {
9156 _PyUnicodeWriter_Dealloc(&writer);
9157 return NULL;
9158 }
9159 if (res == 1)
9160 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009161 }
Victor Stinner33798672016-03-01 21:59:58 +01009162 else {
9163 i = 0;
9164 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009168 int translate;
9169 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9170 Py_ssize_t newpos;
9171 /* startpos for collecting untranslatable chars */
9172 Py_ssize_t collstart;
9173 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009174 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175
Victor Stinner1194ea02014-04-04 19:37:40 +02009176 ch = PyUnicode_READ(kind, data, i);
9177 translate = charmaptranslate_output(ch, mapping, &writer);
9178 if (translate < 0)
9179 goto onError;
9180
9181 if (translate != 0) {
9182 /* it worked => adjust input pointer */
9183 ++i;
9184 continue;
9185 }
9186
9187 /* untranslatable character */
9188 collstart = i;
9189 collend = i+1;
9190
9191 /* find all untranslatable characters */
9192 while (collend < size) {
9193 PyObject *x;
9194 ch = PyUnicode_READ(kind, data, collend);
9195 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009196 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009197 Py_XDECREF(x);
9198 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009200 ++collend;
9201 }
9202
9203 if (ignore) {
9204 i = collend;
9205 }
9206 else {
9207 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9208 reason, input, &exc,
9209 collstart, collend, &newpos);
9210 if (repunicode == NULL)
9211 goto onError;
9212 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009213 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009214 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009215 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009216 Py_DECREF(repunicode);
9217 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009218 }
9219 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009220 Py_XDECREF(exc);
9221 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009222 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009225 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009226 Py_XDECREF(exc);
9227 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 return NULL;
9229}
9230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231/* Deprecated. Use PyUnicode_Translate instead. */
9232PyObject *
9233PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9234 Py_ssize_t size,
9235 PyObject *mapping,
9236 const char *errors)
9237{
Christian Heimes5f520f42012-09-11 14:03:25 +02009238 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009239 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 if (!unicode)
9241 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009242 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9243 Py_DECREF(unicode);
9244 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245}
9246
Alexander Belopolsky40018472011-02-26 01:02:56 +00009247PyObject *
9248PyUnicode_Translate(PyObject *str,
9249 PyObject *mapping,
9250 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009252 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009253 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009254 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255}
Tim Petersced69f82003-09-16 20:30:58 +00009256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257PyObject *
9258_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9259{
9260 if (!PyUnicode_Check(unicode)) {
9261 PyErr_BadInternalCall();
9262 return NULL;
9263 }
9264 if (PyUnicode_READY(unicode) == -1)
9265 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009266 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 /* If the string is already ASCII, just return the same string */
9268 Py_INCREF(unicode);
9269 return unicode;
9270 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009271
9272 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9273 PyObject *result = PyUnicode_New(len, 127);
9274 if (result == NULL) {
9275 return NULL;
9276 }
9277
9278 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9279 int kind = PyUnicode_KIND(unicode);
9280 const void *data = PyUnicode_DATA(unicode);
9281 Py_ssize_t i;
9282 for (i = 0; i < len; ++i) {
9283 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9284 if (ch < 127) {
9285 out[i] = ch;
9286 }
9287 else if (Py_UNICODE_ISSPACE(ch)) {
9288 out[i] = ' ';
9289 }
9290 else {
9291 int decimal = Py_UNICODE_TODECIMAL(ch);
9292 if (decimal < 0) {
9293 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009294 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009295 _PyUnicode_LENGTH(result) = i + 1;
9296 break;
9297 }
9298 out[i] = '0' + decimal;
9299 }
9300 }
9301
INADA Naoki16dfca42018-07-14 12:06:43 +09009302 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009303 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304}
9305
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009306PyObject *
9307PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9308 Py_ssize_t length)
9309{
Victor Stinnerf0124502011-11-21 23:12:56 +01009310 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009311 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009312 Py_UCS4 maxchar;
9313 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009314 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009315
Victor Stinner99d7ad02012-02-22 13:37:39 +01009316 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009317 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009318 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009319 if (ch > 127) {
9320 int decimal = Py_UNICODE_TODECIMAL(ch);
9321 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009322 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009323 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009324 }
9325 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009326
9327 /* Copy to a new string */
9328 decimal = PyUnicode_New(length, maxchar);
9329 if (decimal == NULL)
9330 return decimal;
9331 kind = PyUnicode_KIND(decimal);
9332 data = PyUnicode_DATA(decimal);
9333 /* Iterate over code points */
9334 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009335 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009336 if (ch > 127) {
9337 int decimal = Py_UNICODE_TODECIMAL(ch);
9338 if (decimal >= 0)
9339 ch = '0' + decimal;
9340 }
9341 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009343 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009344}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009345/* --- Decimal Encoder ---------------------------------------------------- */
9346
Alexander Belopolsky40018472011-02-26 01:02:56 +00009347int
9348PyUnicode_EncodeDecimal(Py_UNICODE *s,
9349 Py_ssize_t length,
9350 char *output,
9351 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009352{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009353 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009354 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009355 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009356 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009357
9358 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 PyErr_BadArgument();
9360 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009361 }
9362
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009363 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009364 if (unicode == NULL)
9365 return -1;
9366
Victor Stinner42bf7752011-11-21 22:52:58 +01009367 kind = PyUnicode_KIND(unicode);
9368 data = PyUnicode_DATA(unicode);
9369
Victor Stinnerb84d7232011-11-22 01:50:07 +01009370 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009371 PyObject *exc;
9372 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009374 Py_ssize_t startpos;
9375
9376 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009377
Benjamin Peterson29060642009-01-31 22:14:21 +00009378 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009379 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009380 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009382 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009383 decimal = Py_UNICODE_TODECIMAL(ch);
9384 if (decimal >= 0) {
9385 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009386 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 continue;
9388 }
9389 if (0 < ch && ch < 256) {
9390 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009391 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 continue;
9393 }
Victor Stinner6345be92011-11-25 20:09:01 +01009394
Victor Stinner42bf7752011-11-21 22:52:58 +01009395 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009396 exc = NULL;
9397 raise_encode_exception(&exc, "decimal", unicode,
9398 startpos, startpos+1,
9399 "invalid decimal Unicode string");
9400 Py_XDECREF(exc);
9401 Py_DECREF(unicode);
9402 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009403 }
9404 /* 0-terminate the output string */
9405 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009406 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009407 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009408}
9409
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410/* --- Helpers ------------------------------------------------------------ */
9411
/* Helper macro to fix up start/end slice values against a sequence of
   length len, in place:
     - end greater than len is clamped to len;
     - a negative end or start is taken relative to len, and clamped to 0
       if it is still negative afterwards.
   start is not clamped against len or end.

   start and end must be modifiable lvalues.  Every argument is evaluated
   more than once, so pass plain variables without side effects.  The
   do/while(0) wrapper makes the expansion a single statement, so the macro
   is safe as the body of an unbraced if/else; it must be followed by a
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009428any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009430 Py_ssize_t end,
9431 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009434 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 Py_ssize_t len1, len2, result;
9436
9437 kind1 = PyUnicode_KIND(s1);
9438 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009439 if (kind1 < kind2)
9440 return -1;
9441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 len1 = PyUnicode_GET_LENGTH(s1);
9443 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009444 ADJUST_INDICES(start, end, len1);
9445 if (end - start < len2)
9446 return -1;
9447
9448 buf1 = PyUnicode_DATA(s1);
9449 buf2 = PyUnicode_DATA(s2);
9450 if (len2 == 1) {
9451 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9452 result = findchar((const char *)buf1 + kind1*start,
9453 kind1, end - start, ch, direction);
9454 if (result == -1)
9455 return -1;
9456 else
9457 return start + result;
9458 }
9459
9460 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009461 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009462 if (!buf2)
9463 return -2;
9464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465
Victor Stinner794d5672011-10-10 03:21:36 +02009466 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009467 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009468 case PyUnicode_1BYTE_KIND:
9469 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9470 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9471 else
9472 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9473 break;
9474 case PyUnicode_2BYTE_KIND:
9475 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9476 break;
9477 case PyUnicode_4BYTE_KIND:
9478 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9479 break;
9480 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009481 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009482 }
9483 }
9484 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009485 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009486 case PyUnicode_1BYTE_KIND:
9487 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9488 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9489 else
9490 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9491 break;
9492 case PyUnicode_2BYTE_KIND:
9493 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9494 break;
9495 case PyUnicode_4BYTE_KIND:
9496 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9497 break;
9498 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009499 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009500 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 }
9502
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009503 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009504 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009505 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506
9507 return result;
9508}
9509
Victor Stinner59423e32018-11-26 13:40:01 +01009510/* _PyUnicode_InsertThousandsGrouping() helper functions */
9511#include "stringlib/localeutil.h"
9512
9513/**
9514 * InsertThousandsGrouping:
9515 * @writer: Unicode writer.
9516 * @n_buffer: Number of characters in @buffer.
9517 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9518 * @d_pos: Start of digits string.
9519 * @n_digits: The number of digits in the string, in which we want
9520 * to put the grouping chars.
9521 * @min_width: The minimum width of the digits in the output string.
9522 * Output will be zero-padded on the left to fill.
9523 * @grouping: see definition in localeconv().
9524 * @thousands_sep: see definition in localeconv().
9525 *
9526 * There are 2 modes: counting and filling. If @writer is NULL,
9527 * we are in counting mode, else filling mode.
9528 * If counting, the required buffer size is returned.
9529 * If filling, we know the buffer will be large enough, so we don't
9530 * need to pass in the buffer size.
9531 * Inserts thousand grouping characters (as defined by grouping and
9532 * thousands_sep) into @writer.
9533 *
9534 * Return value: -1 on error, number of characters otherwise.
9535 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009537_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009538 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009539 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009540 PyObject *digits,
9541 Py_ssize_t d_pos,
9542 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009543 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009544 const char *grouping,
9545 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009546 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547{
Xtreak3f7983a2019-01-07 20:39:14 +05309548 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009549 if (writer) {
9550 assert(digits != NULL);
9551 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009552 }
9553 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009554 assert(digits == NULL);
9555 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009556 }
Victor Stinner59423e32018-11-26 13:40:01 +01009557 assert(0 <= d_pos);
9558 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009559 assert(grouping != NULL);
9560
9561 if (digits != NULL) {
9562 if (PyUnicode_READY(digits) == -1) {
9563 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009564 }
Victor Stinner59423e32018-11-26 13:40:01 +01009565 }
9566 if (PyUnicode_READY(thousands_sep) == -1) {
9567 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009568 }
9569
Victor Stinner59423e32018-11-26 13:40:01 +01009570 Py_ssize_t count = 0;
9571 Py_ssize_t n_zeros;
9572 int loop_broken = 0;
9573 int use_separator = 0; /* First time through, don't append the
9574 separator. They only go between
9575 groups. */
9576 Py_ssize_t buffer_pos;
9577 Py_ssize_t digits_pos;
9578 Py_ssize_t len;
9579 Py_ssize_t n_chars;
9580 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9581 be looked at */
9582 /* A generator that returns all of the grouping widths, until it
9583 returns 0. */
9584 GroupGenerator groupgen;
9585 GroupGenerator_init(&groupgen, grouping);
9586 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9587
9588 /* if digits are not grouped, thousands separator
9589 should be an empty string */
9590 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9591
9592 digits_pos = d_pos + n_digits;
9593 if (writer) {
9594 buffer_pos = writer->pos + n_buffer;
9595 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9596 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597 }
Victor Stinner59423e32018-11-26 13:40:01 +01009598 else {
9599 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009600 }
Victor Stinner59423e32018-11-26 13:40:01 +01009601
9602 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009603 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009604 }
Victor Stinner59423e32018-11-26 13:40:01 +01009605
9606 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9607 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9608 n_zeros = Py_MAX(0, len - remaining);
9609 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9610
9611 /* Use n_zero zero's and n_chars chars */
9612
9613 /* Count only, don't do anything. */
9614 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9615
9616 /* Copy into the writer. */
9617 InsertThousandsGrouping_fill(writer, &buffer_pos,
9618 digits, &digits_pos,
9619 n_chars, n_zeros,
9620 use_separator ? thousands_sep : NULL,
9621 thousands_sep_len, maxchar);
9622
9623 /* Use a separator next time. */
9624 use_separator = 1;
9625
9626 remaining -= n_chars;
9627 min_width -= len;
9628
9629 if (remaining <= 0 && min_width <= 0) {
9630 loop_broken = 1;
9631 break;
9632 }
9633 min_width -= thousands_sep_len;
9634 }
9635 if (!loop_broken) {
9636 /* We left the loop without using a break statement. */
9637
9638 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9639 n_zeros = Py_MAX(0, len - remaining);
9640 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9641
9642 /* Use n_zero zero's and n_chars chars */
9643 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9644
9645 /* Copy into the writer. */
9646 InsertThousandsGrouping_fill(writer, &buffer_pos,
9647 digits, &digits_pos,
9648 n_chars, n_zeros,
9649 use_separator ? thousands_sep : NULL,
9650 thousands_sep_len, maxchar);
9651 }
9652 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653}
9654
9655
Alexander Belopolsky40018472011-02-26 01:02:56 +00009656Py_ssize_t
9657PyUnicode_Count(PyObject *str,
9658 PyObject *substr,
9659 Py_ssize_t start,
9660 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009662 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009663 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009664 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009666
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009667 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009668 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009669
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009670 kind1 = PyUnicode_KIND(str);
9671 kind2 = PyUnicode_KIND(substr);
9672 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009673 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009674
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009675 len1 = PyUnicode_GET_LENGTH(str);
9676 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009678 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009679 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009680
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009681 buf1 = PyUnicode_DATA(str);
9682 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009683 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009684 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009685 if (!buf2)
9686 goto onError;
9687 }
9688
9689 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009691 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009692 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009693 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009694 buf2, len2, PY_SSIZE_T_MAX
9695 );
9696 else
9697 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009698 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009699 buf2, len2, PY_SSIZE_T_MAX
9700 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 break;
9702 case PyUnicode_2BYTE_KIND:
9703 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009704 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 buf2, len2, PY_SSIZE_T_MAX
9706 );
9707 break;
9708 case PyUnicode_4BYTE_KIND:
9709 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009710 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 buf2, len2, PY_SSIZE_T_MAX
9712 );
9713 break;
9714 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009715 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009717
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009718 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009719 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009720 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009724 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9725 if (kind2 != kind1)
9726 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728}
9729
Alexander Belopolsky40018472011-02-26 01:02:56 +00009730Py_ssize_t
9731PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009732 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009733 Py_ssize_t start,
9734 Py_ssize_t end,
9735 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009737 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009739
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009740 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741}
9742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743Py_ssize_t
9744PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9745 Py_ssize_t start, Py_ssize_t end,
9746 int direction)
9747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009749 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 if (PyUnicode_READY(str) == -1)
9751 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009752 len = PyUnicode_GET_LENGTH(str);
9753 ADJUST_INDICES(start, end, len);
9754 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009755 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009757 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9758 kind, end-start, ch, direction);
9759 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009761 else
9762 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763}
9764
Alexander Belopolsky40018472011-02-26 01:02:56 +00009765static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009766tailmatch(PyObject *self,
9767 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009768 Py_ssize_t start,
9769 Py_ssize_t end,
9770 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 int kind_self;
9773 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009774 const void *data_self;
9775 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 Py_ssize_t offset;
9777 Py_ssize_t i;
9778 Py_ssize_t end_sub;
9779
9780 if (PyUnicode_READY(self) == -1 ||
9781 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009782 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9785 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009787 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009789 if (PyUnicode_GET_LENGTH(substring) == 0)
9790 return 1;
9791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 kind_self = PyUnicode_KIND(self);
9793 data_self = PyUnicode_DATA(self);
9794 kind_sub = PyUnicode_KIND(substring);
9795 data_sub = PyUnicode_DATA(substring);
9796 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9797
9798 if (direction > 0)
9799 offset = end;
9800 else
9801 offset = start;
9802
9803 if (PyUnicode_READ(kind_self, data_self, offset) ==
9804 PyUnicode_READ(kind_sub, data_sub, 0) &&
9805 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9806 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9807 /* If both are of the same kind, memcmp is sufficient */
9808 if (kind_self == kind_sub) {
9809 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009810 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 data_sub,
9812 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009813 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009815 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 else {
9817 /* We do not need to compare 0 and len(substring)-1 because
9818 the if statement above ensured already that they are equal
9819 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 for (i = 1; i < end_sub; ++i) {
9821 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9822 PyUnicode_READ(kind_sub, data_sub, i))
9823 return 0;
9824 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009825 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827 }
9828
9829 return 0;
9830}
9831
Alexander Belopolsky40018472011-02-26 01:02:56 +00009832Py_ssize_t
9833PyUnicode_Tailmatch(PyObject *str,
9834 PyObject *substr,
9835 Py_ssize_t start,
9836 Py_ssize_t end,
9837 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009839 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009840 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009841
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009842 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843}
9844
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009845static PyObject *
9846ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009848 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009849 const char *data = PyUnicode_DATA(self);
9850 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009851 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009852
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009853 res = PyUnicode_New(len, 127);
9854 if (res == NULL)
9855 return NULL;
9856 resdata = PyUnicode_DATA(res);
9857 if (lower)
9858 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009860 _Py_bytes_upper(resdata, data, len);
9861 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862}
9863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009865handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009867 Py_ssize_t j;
9868 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009869 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009870 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009871
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009872 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9873
9874 where ! is a negation and \p{xxx} is a character with property xxx.
9875 */
9876 for (j = i - 1; j >= 0; j--) {
9877 c = PyUnicode_READ(kind, data, j);
9878 if (!_PyUnicode_IsCaseIgnorable(c))
9879 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009881 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9882 if (final_sigma) {
9883 for (j = i + 1; j < length; j++) {
9884 c = PyUnicode_READ(kind, data, j);
9885 if (!_PyUnicode_IsCaseIgnorable(c))
9886 break;
9887 }
9888 final_sigma = j == length || !_PyUnicode_IsCased(c);
9889 }
9890 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891}
9892
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009893static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009894lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009895 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009897 /* Obscure special case. */
9898 if (c == 0x3A3) {
9899 mapped[0] = handle_capital_sigma(kind, data, length, i);
9900 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009902 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903}
9904
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009905static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009906do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009908 Py_ssize_t i, k = 0;
9909 int n_res, j;
9910 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009911
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009912 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009913 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009914 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009915 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009916 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009918 for (i = 1; i < length; i++) {
9919 c = PyUnicode_READ(kind, data, i);
9920 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9921 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009922 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009923 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009924 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009925 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009926 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927}
9928
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009929static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009930do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009931 Py_ssize_t i, k = 0;
9932
9933 for (i = 0; i < length; i++) {
9934 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9935 int n_res, j;
9936 if (Py_UNICODE_ISUPPER(c)) {
9937 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9938 }
9939 else if (Py_UNICODE_ISLOWER(c)) {
9940 n_res = _PyUnicode_ToUpperFull(c, mapped);
9941 }
9942 else {
9943 n_res = 1;
9944 mapped[0] = c;
9945 }
9946 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009947 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009948 res[k++] = mapped[j];
9949 }
9950 }
9951 return k;
9952}
9953
9954static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009955do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009956 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009958 Py_ssize_t i, k = 0;
9959
9960 for (i = 0; i < length; i++) {
9961 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9962 int n_res, j;
9963 if (lower)
9964 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9965 else
9966 n_res = _PyUnicode_ToUpperFull(c, mapped);
9967 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009968 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009969 res[k++] = mapped[j];
9970 }
9971 }
9972 return k;
9973}
9974
9975static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009976do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009977{
9978 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9979}
9980
9981static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009982do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009983{
9984 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9985}
9986
Benjamin Petersone51757f2012-01-12 21:10:29 -05009987static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009988do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -05009989{
9990 Py_ssize_t i, k = 0;
9991
9992 for (i = 0; i < length; i++) {
9993 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9994 Py_UCS4 mapped[3];
9995 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9996 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009997 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009998 res[k++] = mapped[j];
9999 }
10000 }
10001 return k;
10002}
10003
10004static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010005do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010006{
10007 Py_ssize_t i, k = 0;
10008 int previous_is_cased;
10009
10010 previous_is_cased = 0;
10011 for (i = 0; i < length; i++) {
10012 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10013 Py_UCS4 mapped[3];
10014 int n_res, j;
10015
10016 if (previous_is_cased)
10017 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10018 else
10019 n_res = _PyUnicode_ToTitleFull(c, mapped);
10020
10021 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010022 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010023 res[k++] = mapped[j];
10024 }
10025
10026 previous_is_cased = _PyUnicode_IsCased(c);
10027 }
10028 return k;
10029}
10030
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010031static PyObject *
10032case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010033 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010034{
10035 PyObject *res = NULL;
10036 Py_ssize_t length, newlength = 0;
10037 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010038 const void *data;
10039 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010040 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10041
Benjamin Petersoneea48462012-01-16 14:28:50 -050010042 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010043
10044 kind = PyUnicode_KIND(self);
10045 data = PyUnicode_DATA(self);
10046 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010047 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010048 PyErr_SetString(PyExc_OverflowError, "string is too long");
10049 return NULL;
10050 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010051 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010052 if (tmp == NULL)
10053 return PyErr_NoMemory();
10054 newlength = perform(kind, data, length, tmp, &maxchar);
10055 res = PyUnicode_New(newlength, maxchar);
10056 if (res == NULL)
10057 goto leave;
10058 tmpend = tmp + newlength;
10059 outdata = PyUnicode_DATA(res);
10060 outkind = PyUnicode_KIND(res);
10061 switch (outkind) {
10062 case PyUnicode_1BYTE_KIND:
10063 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10064 break;
10065 case PyUnicode_2BYTE_KIND:
10066 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10067 break;
10068 case PyUnicode_4BYTE_KIND:
10069 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10070 break;
10071 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010072 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010073 }
10074 leave:
10075 PyMem_FREE(tmp);
10076 return res;
10077}
10078
Tim Peters8ce9f162004-08-27 01:49:32 +000010079PyObject *
10080PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010082 PyObject *res;
10083 PyObject *fseq;
10084 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010085 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010086
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010087 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010088 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010089 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010090 }
10091
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010092 /* NOTE: the following code can't call back into Python code,
10093 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010094 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010095
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010096 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010097 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010098 res = _PyUnicode_JoinArray(separator, items, seqlen);
10099 Py_DECREF(fseq);
10100 return res;
10101}
10102
10103PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010104_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010105{
10106 PyObject *res = NULL; /* the result */
10107 PyObject *sep = NULL;
10108 Py_ssize_t seplen;
10109 PyObject *item;
10110 Py_ssize_t sz, i, res_offset;
10111 Py_UCS4 maxchar;
10112 Py_UCS4 item_maxchar;
10113 int use_memcpy;
10114 unsigned char *res_data = NULL, *sep_data = NULL;
10115 PyObject *last_obj;
10116 unsigned int kind = 0;
10117
Tim Peters05eba1f2004-08-27 21:32:02 +000010118 /* If empty sequence, return u"". */
10119 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010120 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010121 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010122
Tim Peters05eba1f2004-08-27 21:32:02 +000010123 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010124 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010125 if (seqlen == 1) {
10126 if (PyUnicode_CheckExact(items[0])) {
10127 res = items[0];
10128 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010129 return res;
10130 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010131 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010132 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010133 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010134 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010135 /* Set up sep and seplen */
10136 if (separator == NULL) {
10137 /* fall back to a blank space separator */
10138 sep = PyUnicode_FromOrdinal(' ');
10139 if (!sep)
10140 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010141 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010142 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010143 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010144 else {
10145 if (!PyUnicode_Check(separator)) {
10146 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010147 "separator: expected str instance,"
10148 " %.80s found",
10149 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010150 goto onError;
10151 }
10152 if (PyUnicode_READY(separator))
10153 goto onError;
10154 sep = separator;
10155 seplen = PyUnicode_GET_LENGTH(separator);
10156 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10157 /* inc refcount to keep this code path symmetric with the
10158 above case of a blank separator */
10159 Py_INCREF(sep);
10160 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010161 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010162 }
10163
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010164 /* There are at least two things to join, or else we have a subclass
10165 * of str in the sequence.
10166 * Do a pre-pass to figure out the total amount of space we'll
10167 * need (sz), and see whether all argument are strings.
10168 */
10169 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010170#ifdef Py_DEBUG
10171 use_memcpy = 0;
10172#else
10173 use_memcpy = 1;
10174#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010175 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010176 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010177 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010178 if (!PyUnicode_Check(item)) {
10179 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010180 "sequence item %zd: expected str instance,"
10181 " %.80s found",
10182 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010183 goto onError;
10184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 if (PyUnicode_READY(item) == -1)
10186 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010187 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010189 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010190 if (i != 0) {
10191 add_sz += seplen;
10192 }
10193 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010194 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010195 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010196 goto onError;
10197 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010198 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010199 if (use_memcpy && last_obj != NULL) {
10200 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10201 use_memcpy = 0;
10202 }
10203 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010204 }
Tim Petersced69f82003-09-16 20:30:58 +000010205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010207 if (res == NULL)
10208 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010209
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010210 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010211#ifdef Py_DEBUG
10212 use_memcpy = 0;
10213#else
10214 if (use_memcpy) {
10215 res_data = PyUnicode_1BYTE_DATA(res);
10216 kind = PyUnicode_KIND(res);
10217 if (seplen != 0)
10218 sep_data = PyUnicode_1BYTE_DATA(sep);
10219 }
10220#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010221 if (use_memcpy) {
10222 for (i = 0; i < seqlen; ++i) {
10223 Py_ssize_t itemlen;
10224 item = items[i];
10225
10226 /* Copy item, and maybe the separator. */
10227 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010228 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010229 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 kind * seplen);
10231 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010232 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010233
10234 itemlen = PyUnicode_GET_LENGTH(item);
10235 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010236 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010237 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010238 kind * itemlen);
10239 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010240 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010241 }
10242 assert(res_data == PyUnicode_1BYTE_DATA(res)
10243 + kind * PyUnicode_GET_LENGTH(res));
10244 }
10245 else {
10246 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10247 Py_ssize_t itemlen;
10248 item = items[i];
10249
10250 /* Copy item, and maybe the separator. */
10251 if (i && seplen != 0) {
10252 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10253 res_offset += seplen;
10254 }
10255
10256 itemlen = PyUnicode_GET_LENGTH(item);
10257 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010258 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010259 res_offset += itemlen;
10260 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010261 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010262 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010263 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010266 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268
Benjamin Peterson29060642009-01-31 22:14:21 +000010269 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010271 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272 return NULL;
10273}
10274
Victor Stinnerd3f08822012-05-29 12:57:52 +020010275void
10276_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10277 Py_UCS4 fill_char)
10278{
10279 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010280 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010281 assert(PyUnicode_IS_READY(unicode));
10282 assert(unicode_modifiable(unicode));
10283 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10284 assert(start >= 0);
10285 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010286 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010287}
10288
Victor Stinner3fe55312012-01-04 00:33:50 +010010289Py_ssize_t
10290PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10291 Py_UCS4 fill_char)
10292{
10293 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010294
10295 if (!PyUnicode_Check(unicode)) {
10296 PyErr_BadInternalCall();
10297 return -1;
10298 }
10299 if (PyUnicode_READY(unicode) == -1)
10300 return -1;
10301 if (unicode_check_modifiable(unicode))
10302 return -1;
10303
Victor Stinnerd3f08822012-05-29 12:57:52 +020010304 if (start < 0) {
10305 PyErr_SetString(PyExc_IndexError, "string index out of range");
10306 return -1;
10307 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010308 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10309 PyErr_SetString(PyExc_ValueError,
10310 "fill character is bigger than "
10311 "the string maximum character");
10312 return -1;
10313 }
10314
10315 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10316 length = Py_MIN(maxlen, length);
10317 if (length <= 0)
10318 return 0;
10319
Victor Stinnerd3f08822012-05-29 12:57:52 +020010320 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010321 return length;
10322}
10323
Victor Stinner9310abb2011-10-05 00:59:23 +020010324static PyObject *
10325pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010326 Py_ssize_t left,
10327 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 PyObject *u;
10331 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010332 int kind;
10333 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334
10335 if (left < 0)
10336 left = 0;
10337 if (right < 0)
10338 right = 0;
10339
Victor Stinnerc4b49542011-12-11 22:44:26 +010010340 if (left == 0 && right == 0)
10341 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10344 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010345 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10346 return NULL;
10347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010349 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010351 if (!u)
10352 return NULL;
10353
10354 kind = PyUnicode_KIND(u);
10355 data = PyUnicode_DATA(u);
10356 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010357 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010358 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010359 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010360 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010361 assert(_PyUnicode_CheckConsistency(u, 1));
10362 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363}
10364
Alexander Belopolsky40018472011-02-26 01:02:56 +000010365PyObject *
10366PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010370 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372
Benjamin Petersonead6b532011-12-20 17:23:42 -060010373 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010375 if (PyUnicode_IS_ASCII(string))
10376 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010377 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010378 PyUnicode_GET_LENGTH(string), keepends);
10379 else
10380 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010381 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010382 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 break;
10384 case PyUnicode_2BYTE_KIND:
10385 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010386 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 PyUnicode_GET_LENGTH(string), keepends);
10388 break;
10389 case PyUnicode_4BYTE_KIND:
10390 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010391 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 PyUnicode_GET_LENGTH(string), keepends);
10393 break;
10394 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010395 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398}
10399
Alexander Belopolsky40018472011-02-26 01:02:56 +000010400static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010401split(PyObject *self,
10402 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010403 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010405 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010406 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 Py_ssize_t len1, len2;
10408 PyObject* out;
10409
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010411 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 if (PyUnicode_READY(self) == -1)
10414 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010417 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010419 if (PyUnicode_IS_ASCII(self))
10420 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010421 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010422 PyUnicode_GET_LENGTH(self), maxcount
10423 );
10424 else
10425 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010426 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010427 PyUnicode_GET_LENGTH(self), maxcount
10428 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 case PyUnicode_2BYTE_KIND:
10430 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010431 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 PyUnicode_GET_LENGTH(self), maxcount
10433 );
10434 case PyUnicode_4BYTE_KIND:
10435 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010436 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 PyUnicode_GET_LENGTH(self), maxcount
10438 );
10439 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010440 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 }
10442
10443 if (PyUnicode_READY(substring) == -1)
10444 return NULL;
10445
10446 kind1 = PyUnicode_KIND(self);
10447 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 len1 = PyUnicode_GET_LENGTH(self);
10449 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010450 if (kind1 < kind2 || len1 < len2) {
10451 out = PyList_New(1);
10452 if (out == NULL)
10453 return NULL;
10454 Py_INCREF(self);
10455 PyList_SET_ITEM(out, 0, self);
10456 return out;
10457 }
10458 buf1 = PyUnicode_DATA(self);
10459 buf2 = PyUnicode_DATA(substring);
10460 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010461 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010462 if (!buf2)
10463 return NULL;
10464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010466 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010468 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10469 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010470 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010471 else
10472 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010473 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 break;
10475 case PyUnicode_2BYTE_KIND:
10476 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010477 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 break;
10479 case PyUnicode_4BYTE_KIND:
10480 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010481 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 break;
10483 default:
10484 out = NULL;
10485 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010486 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010487 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010488 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490}
10491
Alexander Belopolsky40018472011-02-26 01:02:56 +000010492static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010493rsplit(PyObject *self,
10494 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010495 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010496{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010497 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010498 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 Py_ssize_t len1, len2;
10500 PyObject* out;
10501
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010502 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010503 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 if (PyUnicode_READY(self) == -1)
10506 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010509 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010511 if (PyUnicode_IS_ASCII(self))
10512 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010513 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010514 PyUnicode_GET_LENGTH(self), maxcount
10515 );
10516 else
10517 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010518 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010519 PyUnicode_GET_LENGTH(self), maxcount
10520 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 case PyUnicode_2BYTE_KIND:
10522 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010523 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 PyUnicode_GET_LENGTH(self), maxcount
10525 );
10526 case PyUnicode_4BYTE_KIND:
10527 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010528 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 PyUnicode_GET_LENGTH(self), maxcount
10530 );
10531 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010532 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 }
10534
10535 if (PyUnicode_READY(substring) == -1)
10536 return NULL;
10537
10538 kind1 = PyUnicode_KIND(self);
10539 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 len1 = PyUnicode_GET_LENGTH(self);
10541 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010542 if (kind1 < kind2 || len1 < len2) {
10543 out = PyList_New(1);
10544 if (out == NULL)
10545 return NULL;
10546 Py_INCREF(self);
10547 PyList_SET_ITEM(out, 0, self);
10548 return out;
10549 }
10550 buf1 = PyUnicode_DATA(self);
10551 buf2 = PyUnicode_DATA(substring);
10552 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010553 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010554 if (!buf2)
10555 return NULL;
10556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010558 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010560 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10561 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010562 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010563 else
10564 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010565 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 break;
10567 case PyUnicode_2BYTE_KIND:
10568 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010569 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 break;
10571 case PyUnicode_4BYTE_KIND:
10572 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010573 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 break;
10575 default:
10576 out = NULL;
10577 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010578 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010579 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010580 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 return out;
10582}
10583
10584static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010585anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10586 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010588 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010590 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10591 return asciilib_find(buf1, len1, buf2, len2, offset);
10592 else
10593 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 case PyUnicode_2BYTE_KIND:
10595 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10596 case PyUnicode_4BYTE_KIND:
10597 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10598 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010599 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600}
10601
10602static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010603anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10604 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010606 switch (kind) {
10607 case PyUnicode_1BYTE_KIND:
10608 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10609 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10610 else
10611 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10612 case PyUnicode_2BYTE_KIND:
10613 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10614 case PyUnicode_4BYTE_KIND:
10615 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10616 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010617 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010618}
10619
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010620static void
10621replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10622 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10623{
10624 int kind = PyUnicode_KIND(u);
10625 void *data = PyUnicode_DATA(u);
10626 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10627 if (kind == PyUnicode_1BYTE_KIND) {
10628 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10629 (Py_UCS1 *)data + len,
10630 u1, u2, maxcount);
10631 }
10632 else if (kind == PyUnicode_2BYTE_KIND) {
10633 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10634 (Py_UCS2 *)data + len,
10635 u1, u2, maxcount);
10636 }
10637 else {
10638 assert(kind == PyUnicode_4BYTE_KIND);
10639 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10640 (Py_UCS4 *)data + len,
10641 u1, u2, maxcount);
10642 }
10643}
10644
Alexander Belopolsky40018472011-02-26 01:02:56 +000010645static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646replace(PyObject *self, PyObject *str1,
10647 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010650 const char *sbuf = PyUnicode_DATA(self);
10651 const void *buf1 = PyUnicode_DATA(str1);
10652 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 int srelease = 0, release1 = 0, release2 = 0;
10654 int skind = PyUnicode_KIND(self);
10655 int kind1 = PyUnicode_KIND(str1);
10656 int kind2 = PyUnicode_KIND(str2);
10657 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10658 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10659 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010660 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010661 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010663 if (slen < len1)
10664 goto nothing;
10665
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010668 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010669 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670
Victor Stinner59de0ee2011-10-07 10:01:28 +020010671 if (str1 == str2)
10672 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673
Victor Stinner49a0a212011-10-12 23:46:10 +020010674 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010675 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10676 if (maxchar < maxchar_str1)
10677 /* substring too wide to be present */
10678 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010679 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10680 /* Replacing str1 with str2 may cause a maxchar reduction in the
10681 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010682 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010683 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010688 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010690 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010691 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010692 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010693
Victor Stinner69ed0f42013-04-09 21:48:24 +020010694 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010695 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010696 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010698 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010700 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010702
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010703 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10704 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010705 }
10706 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 int rkind = skind;
10708 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010709 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 if (kind1 < rkind) {
10712 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010713 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 if (!buf1) goto error;
10715 release1 = 1;
10716 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010717 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010718 if (i < 0)
10719 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 if (rkind > kind2) {
10721 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010722 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 if (!buf2) goto error;
10724 release2 = 1;
10725 }
10726 else if (rkind < kind2) {
10727 /* widen self and buf1 */
10728 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010729 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010730 assert(buf1 != PyUnicode_DATA(str1));
10731 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010732 buf1 = PyUnicode_DATA(str1);
10733 release1 = 0;
10734 }
10735 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 if (!sbuf) goto error;
10737 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010738 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 if (!buf1) goto error;
10740 release1 = 1;
10741 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010742 u = PyUnicode_New(slen, maxchar);
10743 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010745 assert(PyUnicode_KIND(u) == rkind);
10746 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010747
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010748 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010749 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010750 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010752 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010754
10755 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010756 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010757 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010758 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010759 if (i == -1)
10760 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010761 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010763 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010767 }
10768 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010770 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 int rkind = skind;
10772 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010775 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010776 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 if (!buf1) goto error;
10778 release1 = 1;
10779 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010780 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010781 if (n == 0)
10782 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010784 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010785 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 if (!buf2) goto error;
10787 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010790 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010792 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 if (!sbuf) goto error;
10794 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010795 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010796 assert(buf1 != PyUnicode_DATA(str1));
10797 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010798 buf1 = PyUnicode_DATA(str1);
10799 release1 = 0;
10800 }
10801 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 if (!buf1) goto error;
10803 release1 = 1;
10804 }
10805 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10806 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010807 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 PyErr_SetString(PyExc_OverflowError,
10809 "replace string is too long");
10810 goto error;
10811 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010812 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010813 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010814 _Py_INCREF_UNICODE_EMPTY();
10815 if (!unicode_empty)
10816 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010817 u = unicode_empty;
10818 goto done;
10819 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010820 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 PyErr_SetString(PyExc_OverflowError,
10822 "replace string is too long");
10823 goto error;
10824 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010825 u = PyUnicode_New(new_size, maxchar);
10826 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010828 assert(PyUnicode_KIND(u) == rkind);
10829 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 ires = i = 0;
10831 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010832 while (n-- > 0) {
10833 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010834 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010835 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010836 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010837 if (j == -1)
10838 break;
10839 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010840 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010841 memcpy(res + rkind * ires,
10842 sbuf + rkind * i,
10843 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010845 }
10846 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010848 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010850 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010856 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010857 memcpy(res + rkind * ires,
10858 sbuf + rkind * i,
10859 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010860 }
10861 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010862 /* interleave */
10863 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010864 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010866 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010868 if (--n <= 0)
10869 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010870 memcpy(res + rkind * ires,
10871 sbuf + rkind * i,
10872 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 ires++;
10874 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010875 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010876 memcpy(res + rkind * ires,
10877 sbuf + rkind * i,
10878 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010879 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010880 }
10881
10882 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010883 unicode_adjust_maxchar(&u);
10884 if (u == NULL)
10885 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010887
10888 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010889 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10890 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10891 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010893 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010895 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010897 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010898 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010900
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010902 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010903 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10904 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10905 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010907 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010909 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010911 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010912 return unicode_result_unchanged(self);
10913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010915 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10916 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10917 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10918 if (srelease)
10919 PyMem_FREE((void *)sbuf);
10920 if (release1)
10921 PyMem_FREE((void *)buf1);
10922 if (release2)
10923 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925}
10926
10927/* --- Unicode Object Methods --------------------------------------------- */
10928
INADA Naoki3ae20562017-01-16 20:41:20 +090010929/*[clinic input]
10930str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
INADA Naoki3ae20562017-01-16 20:41:20 +090010932Return a version of the string where each word is titlecased.
10933
10934More specifically, words start with uppercased characters and all remaining
10935cased characters have lower case.
10936[clinic start generated code]*/
10937
10938static PyObject *
10939unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010940/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010942 if (PyUnicode_READY(self) == -1)
10943 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010944 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945}
10946
INADA Naoki3ae20562017-01-16 20:41:20 +090010947/*[clinic input]
10948str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949
INADA Naoki3ae20562017-01-16 20:41:20 +090010950Return a capitalized version of the string.
10951
10952More specifically, make the first character have upper case and the rest lower
10953case.
10954[clinic start generated code]*/
10955
10956static PyObject *
10957unicode_capitalize_impl(PyObject *self)
10958/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010960 if (PyUnicode_READY(self) == -1)
10961 return NULL;
10962 if (PyUnicode_GET_LENGTH(self) == 0)
10963 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010964 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965}
10966
INADA Naoki3ae20562017-01-16 20:41:20 +090010967/*[clinic input]
10968str.casefold as unicode_casefold
10969
10970Return a version of the string suitable for caseless comparisons.
10971[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010972
10973static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010974unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010975/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010976{
10977 if (PyUnicode_READY(self) == -1)
10978 return NULL;
10979 if (PyUnicode_IS_ASCII(self))
10980 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010981 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010982}
10983
10984
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010985/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010986
10987static int
10988convert_uc(PyObject *obj, void *addr)
10989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010991
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010992 if (!PyUnicode_Check(obj)) {
10993 PyErr_Format(PyExc_TypeError,
10994 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010995 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010996 return 0;
10997 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010998 if (PyUnicode_READY(obj) < 0)
10999 return 0;
11000 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011001 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011003 return 0;
11004 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011005 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011006 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011007}
11008
INADA Naoki3ae20562017-01-16 20:41:20 +090011009/*[clinic input]
11010str.center as unicode_center
11011
11012 width: Py_ssize_t
11013 fillchar: Py_UCS4 = ' '
11014 /
11015
11016Return a centered string of length width.
11017
11018Padding is done using the specified fill character (default is a space).
11019[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020
11021static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011022unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11023/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011025 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026
Benjamin Petersonbac79492012-01-14 13:34:47 -050011027 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028 return NULL;
11029
Victor Stinnerc4b49542011-12-11 22:44:26 +010011030 if (PyUnicode_GET_LENGTH(self) >= width)
11031 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032
Victor Stinnerc4b49542011-12-11 22:44:26 +010011033 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034 left = marg / 2 + (marg & width & 1);
11035
Victor Stinner9310abb2011-10-05 00:59:23 +020011036 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037}
11038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039/* This function assumes that str1 and str2 are readied by the caller. */
11040
Marc-André Lemburge5034372000-08-08 08:04:29 +000011041static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011042unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011043{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011044#define COMPARE(TYPE1, TYPE2) \
11045 do { \
11046 TYPE1* p1 = (TYPE1 *)data1; \
11047 TYPE2* p2 = (TYPE2 *)data2; \
11048 TYPE1* end = p1 + len; \
11049 Py_UCS4 c1, c2; \
11050 for (; p1 != end; p1++, p2++) { \
11051 c1 = *p1; \
11052 c2 = *p2; \
11053 if (c1 != c2) \
11054 return (c1 < c2) ? -1 : 1; \
11055 } \
11056 } \
11057 while (0)
11058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011060 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011061 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 kind1 = PyUnicode_KIND(str1);
11064 kind2 = PyUnicode_KIND(str2);
11065 data1 = PyUnicode_DATA(str1);
11066 data2 = PyUnicode_DATA(str2);
11067 len1 = PyUnicode_GET_LENGTH(str1);
11068 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011069 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011070
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011071 switch(kind1) {
11072 case PyUnicode_1BYTE_KIND:
11073 {
11074 switch(kind2) {
11075 case PyUnicode_1BYTE_KIND:
11076 {
11077 int cmp = memcmp(data1, data2, len);
11078 /* normalize result of memcmp() into the range [-1; 1] */
11079 if (cmp < 0)
11080 return -1;
11081 if (cmp > 0)
11082 return 1;
11083 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011084 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011085 case PyUnicode_2BYTE_KIND:
11086 COMPARE(Py_UCS1, Py_UCS2);
11087 break;
11088 case PyUnicode_4BYTE_KIND:
11089 COMPARE(Py_UCS1, Py_UCS4);
11090 break;
11091 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011092 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011093 }
11094 break;
11095 }
11096 case PyUnicode_2BYTE_KIND:
11097 {
11098 switch(kind2) {
11099 case PyUnicode_1BYTE_KIND:
11100 COMPARE(Py_UCS2, Py_UCS1);
11101 break;
11102 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011103 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011104 COMPARE(Py_UCS2, Py_UCS2);
11105 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011106 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011107 case PyUnicode_4BYTE_KIND:
11108 COMPARE(Py_UCS2, Py_UCS4);
11109 break;
11110 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011111 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011112 }
11113 break;
11114 }
11115 case PyUnicode_4BYTE_KIND:
11116 {
11117 switch(kind2) {
11118 case PyUnicode_1BYTE_KIND:
11119 COMPARE(Py_UCS4, Py_UCS1);
11120 break;
11121 case PyUnicode_2BYTE_KIND:
11122 COMPARE(Py_UCS4, Py_UCS2);
11123 break;
11124 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011125 {
11126#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11127 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11128 /* normalize result of wmemcmp() into the range [-1; 1] */
11129 if (cmp < 0)
11130 return -1;
11131 if (cmp > 0)
11132 return 1;
11133#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011134 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011135#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011136 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011137 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011138 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011139 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011140 }
11141 break;
11142 }
11143 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011144 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011145 }
11146
Victor Stinner770e19e2012-10-04 22:59:45 +020011147 if (len1 == len2)
11148 return 0;
11149 if (len1 < len2)
11150 return -1;
11151 else
11152 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011153
11154#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011155}
11156
Benjamin Peterson621b4302016-09-09 13:54:34 -070011157static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011158unicode_compare_eq(PyObject *str1, PyObject *str2)
11159{
11160 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011161 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011162 Py_ssize_t len;
11163 int cmp;
11164
Victor Stinnere5567ad2012-10-23 02:48:49 +020011165 len = PyUnicode_GET_LENGTH(str1);
11166 if (PyUnicode_GET_LENGTH(str2) != len)
11167 return 0;
11168 kind = PyUnicode_KIND(str1);
11169 if (PyUnicode_KIND(str2) != kind)
11170 return 0;
11171 data1 = PyUnicode_DATA(str1);
11172 data2 = PyUnicode_DATA(str2);
11173
11174 cmp = memcmp(data1, data2, len * kind);
11175 return (cmp == 0);
11176}
11177
11178
Alexander Belopolsky40018472011-02-26 01:02:56 +000011179int
11180PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11183 if (PyUnicode_READY(left) == -1 ||
11184 PyUnicode_READY(right) == -1)
11185 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011186
11187 /* a string is equal to itself */
11188 if (left == right)
11189 return 0;
11190
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011191 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011193 PyErr_Format(PyExc_TypeError,
11194 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011195 Py_TYPE(left)->tp_name,
11196 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197 return -1;
11198}
11199
Martin v. Löwis5b222132007-06-10 09:51:05 +000011200int
11201PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11202{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203 Py_ssize_t i;
11204 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011206 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207
Victor Stinner910337b2011-10-03 03:20:16 +020011208 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011209 if (!PyUnicode_IS_READY(uni)) {
11210 const wchar_t *ws = _PyUnicode_WSTR(uni);
11211 /* Compare Unicode string and source character set string */
11212 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11213 if (chr != ustr[i])
11214 return (chr < ustr[i]) ? -1 : 1;
11215 }
11216 /* This check keeps Python strings that end in '\0' from comparing equal
11217 to C strings identical up to that point. */
11218 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11219 return 1; /* uni is longer */
11220 if (ustr[i])
11221 return -1; /* str is longer */
11222 return 0;
11223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011225 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011226 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011227 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011228 size_t len, len2 = strlen(str);
11229 int cmp;
11230
11231 len = Py_MIN(len1, len2);
11232 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011233 if (cmp != 0) {
11234 if (cmp < 0)
11235 return -1;
11236 else
11237 return 1;
11238 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011239 if (len1 > len2)
11240 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011241 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011242 return -1; /* str is longer */
11243 return 0;
11244 }
11245 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011246 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011247 /* Compare Unicode string and source character set string */
11248 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011249 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011250 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11251 /* This check keeps Python strings that end in '\0' from comparing equal
11252 to C strings identical up to that point. */
11253 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11254 return 1; /* uni is longer */
11255 if (str[i])
11256 return -1; /* str is longer */
11257 return 0;
11258 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011259}
11260
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011261static int
11262non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11263{
11264 size_t i, len;
11265 const wchar_t *p;
11266 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11267 if (strlen(str) != len)
11268 return 0;
11269 p = _PyUnicode_WSTR(unicode);
11270 assert(p);
11271 for (i = 0; i < len; i++) {
11272 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011273 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011274 return 0;
11275 }
11276 return 1;
11277}
11278
11279int
11280_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11281{
11282 size_t len;
11283 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011284 assert(str);
11285#ifndef NDEBUG
11286 for (const char *p = str; *p; p++) {
11287 assert((unsigned char)*p < 128);
11288 }
11289#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011290 if (PyUnicode_READY(unicode) == -1) {
11291 /* Memory error or bad data */
11292 PyErr_Clear();
11293 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11294 }
11295 if (!PyUnicode_IS_ASCII(unicode))
11296 return 0;
11297 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11298 return strlen(str) == len &&
11299 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11300}
11301
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011302int
11303_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11304{
11305 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011306
11307 assert(_PyUnicode_CHECK(left));
11308 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011309#ifndef NDEBUG
11310 for (const char *p = right->string; *p; p++) {
11311 assert((unsigned char)*p < 128);
11312 }
11313#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011314
11315 if (PyUnicode_READY(left) == -1) {
11316 /* memory error or bad data */
11317 PyErr_Clear();
11318 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11319 }
11320
11321 if (!PyUnicode_IS_ASCII(left))
11322 return 0;
11323
11324 right_uni = _PyUnicode_FromId(right); /* borrowed */
11325 if (right_uni == NULL) {
11326 /* memory error or bad data */
11327 PyErr_Clear();
11328 return _PyUnicode_EqualToASCIIString(left, right->string);
11329 }
11330
11331 if (left == right_uni)
11332 return 1;
11333
11334 if (PyUnicode_CHECK_INTERNED(left))
11335 return 0;
11336
Victor Stinner607b1022020-05-05 18:50:30 +020011337#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011338 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011339 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011340 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11341 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011342#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011343
11344 return unicode_compare_eq(left, right_uni);
11345}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011346
Alexander Belopolsky40018472011-02-26 01:02:56 +000011347PyObject *
11348PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011349{
11350 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011351
Victor Stinnere5567ad2012-10-23 02:48:49 +020011352 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11353 Py_RETURN_NOTIMPLEMENTED;
11354
11355 if (PyUnicode_READY(left) == -1 ||
11356 PyUnicode_READY(right) == -1)
11357 return NULL;
11358
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011359 if (left == right) {
11360 switch (op) {
11361 case Py_EQ:
11362 case Py_LE:
11363 case Py_GE:
11364 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011365 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011366 case Py_NE:
11367 case Py_LT:
11368 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011369 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011370 default:
11371 PyErr_BadArgument();
11372 return NULL;
11373 }
11374 }
11375 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011376 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011377 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011378 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011379 }
11380 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011381 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011382 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011383 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011384}
11385
Alexander Belopolsky40018472011-02-26 01:02:56 +000011386int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011387_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11388{
11389 return unicode_eq(aa, bb);
11390}
11391
11392int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011393PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011394{
Victor Stinner77282cb2013-04-14 19:22:47 +020011395 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011396 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011398 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011399
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011400 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011402 "'in <string>' requires string as left operand, not %.100s",
11403 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011404 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011405 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011406 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011407 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011408 if (ensure_unicode(str) < 0)
11409 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011412 kind2 = PyUnicode_KIND(substr);
11413 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011414 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011416 len2 = PyUnicode_GET_LENGTH(substr);
11417 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011418 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011419 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011420 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011421 if (len2 == 1) {
11422 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11423 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011424 return result;
11425 }
11426 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011427 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011428 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011429 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431
Victor Stinner77282cb2013-04-14 19:22:47 +020011432 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 case PyUnicode_1BYTE_KIND:
11434 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11435 break;
11436 case PyUnicode_2BYTE_KIND:
11437 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11438 break;
11439 case PyUnicode_4BYTE_KIND:
11440 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11441 break;
11442 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011443 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011445
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011446 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011447 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011448 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449
Guido van Rossum403d68b2000-03-13 15:55:09 +000011450 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011451}
11452
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453/* Concat to string or Unicode object giving a new Unicode object. */
11454
Alexander Belopolsky40018472011-02-26 01:02:56 +000011455PyObject *
11456PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011458 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011459 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011460 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011462 if (ensure_unicode(left) < 0)
11463 return NULL;
11464
11465 if (!PyUnicode_Check(right)) {
11466 PyErr_Format(PyExc_TypeError,
11467 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011468 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011469 return NULL;
11470 }
11471 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011472 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473
11474 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011475 if (left == unicode_empty)
11476 return PyUnicode_FromObject(right);
11477 if (right == unicode_empty)
11478 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011480 left_len = PyUnicode_GET_LENGTH(left);
11481 right_len = PyUnicode_GET_LENGTH(right);
11482 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011483 PyErr_SetString(PyExc_OverflowError,
11484 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011485 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011486 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011487 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011488
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011489 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11490 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011491 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011494 result = PyUnicode_New(new_len, maxchar);
11495 if (result == NULL)
11496 return NULL;
11497 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11498 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11499 assert(_PyUnicode_CheckConsistency(result, 1));
11500 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501}
11502
Walter Dörwald1ab83302007-05-18 17:15:44 +000011503void
Victor Stinner23e56682011-10-03 03:54:37 +020011504PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011505{
Victor Stinner23e56682011-10-03 03:54:37 +020011506 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011507 Py_UCS4 maxchar, maxchar2;
11508 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011509
11510 if (p_left == NULL) {
11511 if (!PyErr_Occurred())
11512 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011513 return;
11514 }
Victor Stinner23e56682011-10-03 03:54:37 +020011515 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011516 if (right == NULL || left == NULL
11517 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011518 if (!PyErr_Occurred())
11519 PyErr_BadInternalCall();
11520 goto error;
11521 }
11522
Benjamin Petersonbac79492012-01-14 13:34:47 -050011523 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011524 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011525 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011526 goto error;
11527
Victor Stinner488fa492011-12-12 00:01:39 +010011528 /* Shortcuts */
11529 if (left == unicode_empty) {
11530 Py_DECREF(left);
11531 Py_INCREF(right);
11532 *p_left = right;
11533 return;
11534 }
11535 if (right == unicode_empty)
11536 return;
11537
11538 left_len = PyUnicode_GET_LENGTH(left);
11539 right_len = PyUnicode_GET_LENGTH(right);
11540 if (left_len > PY_SSIZE_T_MAX - right_len) {
11541 PyErr_SetString(PyExc_OverflowError,
11542 "strings are too large to concat");
11543 goto error;
11544 }
11545 new_len = left_len + right_len;
11546
11547 if (unicode_modifiable(left)
11548 && PyUnicode_CheckExact(right)
11549 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011550 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11551 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011552 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011553 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011554 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11555 {
11556 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011557 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011558 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011559
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011560 /* copy 'right' into the newly allocated area of 'left' */
11561 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011562 }
Victor Stinner488fa492011-12-12 00:01:39 +010011563 else {
11564 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11565 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011566 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011567
Victor Stinner488fa492011-12-12 00:01:39 +010011568 /* Concat the two Unicode strings */
11569 res = PyUnicode_New(new_len, maxchar);
11570 if (res == NULL)
11571 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011572 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11573 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011574 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011575 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011576 }
11577 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011578 return;
11579
11580error:
Victor Stinner488fa492011-12-12 00:01:39 +010011581 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011582}
11583
11584void
11585PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11586{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011587 PyUnicode_Append(pleft, right);
11588 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011589}
11590
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011591/*
11592Wraps stringlib_parse_args_finds() and additionally ensures that the
11593first argument is a unicode object.
11594*/
11595
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011596static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011597parse_args_finds_unicode(const char * function_name, PyObject *args,
11598 PyObject **substring,
11599 Py_ssize_t *start, Py_ssize_t *end)
11600{
11601 if(stringlib_parse_args_finds(function_name, args, substring,
11602 start, end)) {
11603 if (ensure_unicode(*substring) < 0)
11604 return 0;
11605 return 1;
11606 }
11607 return 0;
11608}
11609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011610PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011611 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011613Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011614string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011615interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616
11617static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011618unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011620 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011621 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011622 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011624 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011625 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011628 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 kind1 = PyUnicode_KIND(self);
11632 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011633 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011634 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 len1 = PyUnicode_GET_LENGTH(self);
11637 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011639 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011640 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011641
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011642 buf1 = PyUnicode_DATA(self);
11643 buf2 = PyUnicode_DATA(substring);
11644 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011645 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011646 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011647 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011648 }
11649 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 case PyUnicode_1BYTE_KIND:
11651 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011652 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 buf2, len2, PY_SSIZE_T_MAX
11654 );
11655 break;
11656 case PyUnicode_2BYTE_KIND:
11657 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011658 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 buf2, len2, PY_SSIZE_T_MAX
11660 );
11661 break;
11662 case PyUnicode_4BYTE_KIND:
11663 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011664 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 buf2, len2, PY_SSIZE_T_MAX
11666 );
11667 break;
11668 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011669 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 }
11671
11672 result = PyLong_FromSsize_t(iresult);
11673
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011674 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011675 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011676 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 return result;
11679}
11680
INADA Naoki3ae20562017-01-16 20:41:20 +090011681/*[clinic input]
11682str.encode as unicode_encode
11683
11684 encoding: str(c_default="NULL") = 'utf-8'
11685 The encoding in which to encode the string.
11686 errors: str(c_default="NULL") = 'strict'
11687 The error handling scheme to use for encoding errors.
11688 The default is 'strict' meaning that encoding errors raise a
11689 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11690 'xmlcharrefreplace' as well as any other name registered with
11691 codecs.register_error that can handle UnicodeEncodeErrors.
11692
11693Encode the string using the codec registered for encoding.
11694[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
11696static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011697unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011698/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011700 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011701}
11702
INADA Naoki3ae20562017-01-16 20:41:20 +090011703/*[clinic input]
11704str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705
INADA Naoki3ae20562017-01-16 20:41:20 +090011706 tabsize: int = 8
11707
11708Return a copy where all tab characters are expanded using spaces.
11709
11710If tabsize is not given, a tab size of 8 characters is assumed.
11711[clinic start generated code]*/
11712
11713static PyObject *
11714unicode_expandtabs_impl(PyObject *self, int tabsize)
11715/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011717 Py_ssize_t i, j, line_pos, src_len, incr;
11718 Py_UCS4 ch;
11719 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011720 const void *src_data;
11721 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011722 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011723 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
Antoine Pitrou22425222011-10-04 19:10:51 +020011725 if (PyUnicode_READY(self) == -1)
11726 return NULL;
11727
Thomas Wouters7e474022000-07-16 12:04:32 +000011728 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011729 src_len = PyUnicode_GET_LENGTH(self);
11730 i = j = line_pos = 0;
11731 kind = PyUnicode_KIND(self);
11732 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011733 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011734 for (; i < src_len; i++) {
11735 ch = PyUnicode_READ(kind, src_data, i);
11736 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011737 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011739 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011741 goto overflow;
11742 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011743 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011744 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011748 goto overflow;
11749 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011751 if (ch == '\n' || ch == '\r')
11752 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011754 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011755 if (!found)
11756 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011757
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011759 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760 if (!u)
11761 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011762 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
Antoine Pitroue71d5742011-10-04 15:55:09 +020011764 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Antoine Pitroue71d5742011-10-04 15:55:09 +020011766 for (; i < src_len; i++) {
11767 ch = PyUnicode_READ(kind, src_data, i);
11768 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011769 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011770 incr = tabsize - (line_pos % tabsize);
11771 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011772 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011773 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011775 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011777 line_pos++;
11778 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011779 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011780 if (ch == '\n' || ch == '\r')
11781 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011783 }
11784 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011785 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011786
Antoine Pitroue71d5742011-10-04 15:55:09 +020011787 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011788 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11789 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790}
11791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011792PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011793 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794\n\
11795Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011796such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797arguments start and end are interpreted as in slice notation.\n\
11798\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011799Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800
11801static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011804 /* initialize variables to prevent gcc warning */
11805 PyObject *substring = NULL;
11806 Py_ssize_t start = 0;
11807 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011808 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011810 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011813 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011816 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 if (result == -2)
11819 return NULL;
11820
Christian Heimes217cfd12007-12-02 14:31:20 +000011821 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822}
11823
11824static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011825unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011827 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011828 enum PyUnicode_Kind kind;
11829 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011830
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011831 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011832 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011834 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011835 if (PyUnicode_READY(self) == -1) {
11836 return NULL;
11837 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011838 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11839 PyErr_SetString(PyExc_IndexError, "string index out of range");
11840 return NULL;
11841 }
11842 kind = PyUnicode_KIND(self);
11843 data = PyUnicode_DATA(self);
11844 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011845 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846}
11847
Guido van Rossumc2504932007-09-18 19:42:40 +000011848/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011849 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011850static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011851unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011853 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011854
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011855#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011856 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011857#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 if (_PyUnicode_HASH(self) != -1)
11859 return _PyUnicode_HASH(self);
11860 if (PyUnicode_READY(self) == -1)
11861 return -1;
animalizea1d14252019-01-02 20:16:06 +080011862
Christian Heimes985ecdc2013-11-20 11:46:18 +010011863 x = _Py_HashBytes(PyUnicode_DATA(self),
11864 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011866 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867}
11868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011869PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871\n\
oldkaa0735f2018-02-02 16:52:55 +080011872Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011873such that sub is contained within S[start:end]. Optional\n\
11874arguments start and end are interpreted as in slice notation.\n\
11875\n\
11876Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877
11878static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011881 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011882 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011883 PyObject *substring = NULL;
11884 Py_ssize_t start = 0;
11885 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011887 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011890 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011893 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 if (result == -2)
11896 return NULL;
11897
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898 if (result < 0) {
11899 PyErr_SetString(PyExc_ValueError, "substring not found");
11900 return NULL;
11901 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011902
Christian Heimes217cfd12007-12-02 14:31:20 +000011903 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904}
11905
INADA Naoki3ae20562017-01-16 20:41:20 +090011906/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011907str.isascii as unicode_isascii
11908
11909Return True if all characters in the string are ASCII, False otherwise.
11910
11911ASCII characters have code points in the range U+0000-U+007F.
11912Empty string is ASCII too.
11913[clinic start generated code]*/
11914
11915static PyObject *
11916unicode_isascii_impl(PyObject *self)
11917/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11918{
11919 if (PyUnicode_READY(self) == -1) {
11920 return NULL;
11921 }
11922 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11923}
11924
11925/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011926str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927
INADA Naoki3ae20562017-01-16 20:41:20 +090011928Return True if the string is a lowercase string, False otherwise.
11929
11930A string is lowercase if all cased characters in the string are lowercase and
11931there is at least one cased character in the string.
11932[clinic start generated code]*/
11933
11934static PyObject *
11935unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011936/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 Py_ssize_t i, length;
11939 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011940 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 int cased;
11942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 if (PyUnicode_READY(self) == -1)
11944 return NULL;
11945 length = PyUnicode_GET_LENGTH(self);
11946 kind = PyUnicode_KIND(self);
11947 data = PyUnicode_DATA(self);
11948
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 if (length == 1)
11951 return PyBool_FromLong(
11952 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011954 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011956 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011957
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 for (i = 0; i < length; i++) {
11960 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011961
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011963 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 else if (!cased && Py_UNICODE_ISLOWER(ch))
11965 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011967 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968}
11969
INADA Naoki3ae20562017-01-16 20:41:20 +090011970/*[clinic input]
11971str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972
INADA Naoki3ae20562017-01-16 20:41:20 +090011973Return True if the string is an uppercase string, False otherwise.
11974
11975A string is uppercase if all cased characters in the string are uppercase and
11976there is at least one cased character in the string.
11977[clinic start generated code]*/
11978
11979static PyObject *
11980unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011981/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 Py_ssize_t i, length;
11984 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011985 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986 int cased;
11987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 if (PyUnicode_READY(self) == -1)
11989 return NULL;
11990 length = PyUnicode_GET_LENGTH(self);
11991 kind = PyUnicode_KIND(self);
11992 data = PyUnicode_DATA(self);
11993
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (length == 1)
11996 return PyBool_FromLong(
11997 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011999 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012001 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012002
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 for (i = 0; i < length; i++) {
12005 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012006
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012008 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 else if (!cased && Py_UNICODE_ISUPPER(ch))
12010 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012012 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013}
12014
INADA Naoki3ae20562017-01-16 20:41:20 +090012015/*[clinic input]
12016str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017
INADA Naoki3ae20562017-01-16 20:41:20 +090012018Return True if the string is a title-cased string, False otherwise.
12019
12020In a title-cased string, upper- and title-case characters may only
12021follow uncased characters and lowercase characters only cased ones.
12022[clinic start generated code]*/
12023
12024static PyObject *
12025unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012026/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 Py_ssize_t i, length;
12029 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012030 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031 int cased, previous_is_cased;
12032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 if (PyUnicode_READY(self) == -1)
12034 return NULL;
12035 length = PyUnicode_GET_LENGTH(self);
12036 kind = PyUnicode_KIND(self);
12037 data = PyUnicode_DATA(self);
12038
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 if (length == 1) {
12041 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12042 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12043 (Py_UNICODE_ISUPPER(ch) != 0));
12044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012046 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012048 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012049
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050 cased = 0;
12051 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 for (i = 0; i < length; i++) {
12053 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012054
Benjamin Peterson29060642009-01-31 22:14:21 +000012055 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12056 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012057 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012058 previous_is_cased = 1;
12059 cased = 1;
12060 }
12061 else if (Py_UNICODE_ISLOWER(ch)) {
12062 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012063 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 previous_is_cased = 1;
12065 cased = 1;
12066 }
12067 else
12068 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012070 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071}
12072
INADA Naoki3ae20562017-01-16 20:41:20 +090012073/*[clinic input]
12074str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075
INADA Naoki3ae20562017-01-16 20:41:20 +090012076Return True if the string is a whitespace string, False otherwise.
12077
12078A string is whitespace if all characters in the string are whitespace and there
12079is at least one character in the string.
12080[clinic start generated code]*/
12081
12082static PyObject *
12083unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012084/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 Py_ssize_t i, length;
12087 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012088 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089
12090 if (PyUnicode_READY(self) == -1)
12091 return NULL;
12092 length = PyUnicode_GET_LENGTH(self);
12093 kind = PyUnicode_KIND(self);
12094 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 if (length == 1)
12098 return PyBool_FromLong(
12099 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012101 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012103 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 for (i = 0; i < length; i++) {
12106 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012107 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012108 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012110 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111}
12112
INADA Naoki3ae20562017-01-16 20:41:20 +090012113/*[clinic input]
12114str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012115
INADA Naoki3ae20562017-01-16 20:41:20 +090012116Return True if the string is an alphabetic string, False otherwise.
12117
12118A string is alphabetic if all characters in the string are alphabetic and there
12119is at least one character in the string.
12120[clinic start generated code]*/
12121
12122static PyObject *
12123unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012124/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 Py_ssize_t i, length;
12127 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012128 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129
12130 if (PyUnicode_READY(self) == -1)
12131 return NULL;
12132 length = PyUnicode_GET_LENGTH(self);
12133 kind = PyUnicode_KIND(self);
12134 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012135
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012136 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 if (length == 1)
12138 return PyBool_FromLong(
12139 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012140
12141 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012143 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 for (i = 0; i < length; i++) {
12146 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012147 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012148 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012149 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012150}
12151
INADA Naoki3ae20562017-01-16 20:41:20 +090012152/*[clinic input]
12153str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012154
INADA Naoki3ae20562017-01-16 20:41:20 +090012155Return True if the string is an alpha-numeric string, False otherwise.
12156
12157A string is alpha-numeric if all characters in the string are alpha-numeric and
12158there is at least one character in the string.
12159[clinic start generated code]*/
12160
12161static PyObject *
12162unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012163/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012166 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 Py_ssize_t len, i;
12168
12169 if (PyUnicode_READY(self) == -1)
12170 return NULL;
12171
12172 kind = PyUnicode_KIND(self);
12173 data = PyUnicode_DATA(self);
12174 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012175
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012176 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 if (len == 1) {
12178 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12179 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12180 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012181
12182 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012184 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 for (i = 0; i < len; i++) {
12187 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012188 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012189 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012190 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012191 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012192}
12193
INADA Naoki3ae20562017-01-16 20:41:20 +090012194/*[clinic input]
12195str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196
INADA Naoki3ae20562017-01-16 20:41:20 +090012197Return True if the string is a decimal string, False otherwise.
12198
12199A string is a decimal string if all characters in the string are decimal and
12200there is at least one character in the string.
12201[clinic start generated code]*/
12202
12203static PyObject *
12204unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012205/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 Py_ssize_t i, length;
12208 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012209 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210
12211 if (PyUnicode_READY(self) == -1)
12212 return NULL;
12213 length = PyUnicode_GET_LENGTH(self);
12214 kind = PyUnicode_KIND(self);
12215 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 if (length == 1)
12219 return PyBool_FromLong(
12220 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012222 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012224 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 for (i = 0; i < length; i++) {
12227 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012228 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012230 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231}
12232
INADA Naoki3ae20562017-01-16 20:41:20 +090012233/*[clinic input]
12234str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235
INADA Naoki3ae20562017-01-16 20:41:20 +090012236Return True if the string is a digit string, False otherwise.
12237
12238A string is a digit string if all characters in the string are digits and there
12239is at least one character in the string.
12240[clinic start generated code]*/
12241
12242static PyObject *
12243unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012244/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 Py_ssize_t i, length;
12247 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012248 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249
12250 if (PyUnicode_READY(self) == -1)
12251 return NULL;
12252 length = PyUnicode_GET_LENGTH(self);
12253 kind = PyUnicode_KIND(self);
12254 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 if (length == 1) {
12258 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12259 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012262 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012264 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 for (i = 0; i < length; i++) {
12267 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012268 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012270 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271}
12272
INADA Naoki3ae20562017-01-16 20:41:20 +090012273/*[clinic input]
12274str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275
INADA Naoki3ae20562017-01-16 20:41:20 +090012276Return True if the string is a numeric string, False otherwise.
12277
12278A string is numeric if all characters in the string are numeric and there is at
12279least one character in the string.
12280[clinic start generated code]*/
12281
12282static PyObject *
12283unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012284/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 Py_ssize_t i, length;
12287 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012288 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289
12290 if (PyUnicode_READY(self) == -1)
12291 return NULL;
12292 length = PyUnicode_GET_LENGTH(self);
12293 kind = PyUnicode_KIND(self);
12294 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 if (length == 1)
12298 return PyBool_FromLong(
12299 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012301 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012303 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 for (i = 0; i < length; i++) {
12306 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012307 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012309 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310}
12311
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012312Py_ssize_t
12313_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012316 if (PyUnicode_READY(self) == -1)
12317 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012318
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012319 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012320 if (len == 0) {
12321 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 }
12324
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012325 int kind = PyUnicode_KIND(self);
12326 const void *data = PyUnicode_DATA(self);
12327 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012328 /* PEP 3131 says that the first character must be in
12329 XID_Start and subsequent characters in XID_Continue,
12330 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012331 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012332 letters, digits, underscore). However, given the current
12333 definition of XID_Start and XID_Continue, it is sufficient
12334 to check just for these, except that _ must be allowed
12335 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012336 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012337 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012338 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012339
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012340 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012341 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012342 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012343 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012344 }
12345 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012346 return i;
12347}
12348
12349int
12350PyUnicode_IsIdentifier(PyObject *self)
12351{
12352 if (PyUnicode_IS_READY(self)) {
12353 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12354 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12355 /* an empty string is not a valid identifier */
12356 return len && i == len;
12357 }
12358 else {
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012359 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012360 if (len == 0) {
12361 /* an empty string is not a valid identifier */
12362 return 0;
12363 }
12364
12365 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012366 Py_UCS4 ch = wstr[i++];
12367#if SIZEOF_WCHAR_T == 2
12368 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12369 && i < len
12370 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12371 {
12372 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12373 i++;
12374 }
12375#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012376 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12377 return 0;
12378 }
12379
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012380 while (i < len) {
12381 ch = wstr[i++];
12382#if SIZEOF_WCHAR_T == 2
12383 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12384 && i < len
12385 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12386 {
12387 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12388 i++;
12389 }
12390#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012391 if (!_PyUnicode_IsXidContinue(ch)) {
12392 return 0;
12393 }
12394 }
12395 return 1;
12396 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012397}
12398
INADA Naoki3ae20562017-01-16 20:41:20 +090012399/*[clinic input]
12400str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012401
INADA Naoki3ae20562017-01-16 20:41:20 +090012402Return True if the string is a valid Python identifier, False otherwise.
12403
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012404Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012405such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012406[clinic start generated code]*/
12407
12408static PyObject *
12409unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012410/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012411{
12412 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12413}
12414
INADA Naoki3ae20562017-01-16 20:41:20 +090012415/*[clinic input]
12416str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012417
INADA Naoki3ae20562017-01-16 20:41:20 +090012418Return True if the string is printable, False otherwise.
12419
12420A string is printable if all of its characters are considered printable in
12421repr() or if it is empty.
12422[clinic start generated code]*/
12423
12424static PyObject *
12425unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012426/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 Py_ssize_t i, length;
12429 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012430 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431
12432 if (PyUnicode_READY(self) == -1)
12433 return NULL;
12434 length = PyUnicode_GET_LENGTH(self);
12435 kind = PyUnicode_KIND(self);
12436 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012437
12438 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 if (length == 1)
12440 return PyBool_FromLong(
12441 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 for (i = 0; i < length; i++) {
12444 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012445 Py_RETURN_FALSE;
12446 }
12447 }
12448 Py_RETURN_TRUE;
12449}
12450
INADA Naoki3ae20562017-01-16 20:41:20 +090012451/*[clinic input]
12452str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453
INADA Naoki3ae20562017-01-16 20:41:20 +090012454 iterable: object
12455 /
12456
12457Concatenate any number of strings.
12458
Martin Panter91a88662017-01-24 00:30:06 +000012459The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012460The result is returned as a new string.
12461
12462Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12463[clinic start generated code]*/
12464
12465static PyObject *
12466unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012467/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468{
INADA Naoki3ae20562017-01-16 20:41:20 +090012469 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470}
12471
Martin v. Löwis18e16552006-02-15 17:27:45 +000012472static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012473unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 if (PyUnicode_READY(self) == -1)
12476 return -1;
12477 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478}
12479
INADA Naoki3ae20562017-01-16 20:41:20 +090012480/*[clinic input]
12481str.ljust as unicode_ljust
12482
12483 width: Py_ssize_t
12484 fillchar: Py_UCS4 = ' '
12485 /
12486
12487Return a left-justified string of length width.
12488
12489Padding is done using the specified fill character (default is a space).
12490[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491
12492static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012493unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12494/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012496 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498
Victor Stinnerc4b49542011-12-11 22:44:26 +010012499 if (PyUnicode_GET_LENGTH(self) >= width)
12500 return unicode_result_unchanged(self);
12501
12502 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503}
12504
INADA Naoki3ae20562017-01-16 20:41:20 +090012505/*[clinic input]
12506str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507
INADA Naoki3ae20562017-01-16 20:41:20 +090012508Return a copy of the string converted to lowercase.
12509[clinic start generated code]*/
12510
12511static PyObject *
12512unicode_lower_impl(PyObject *self)
12513/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012515 if (PyUnicode_READY(self) == -1)
12516 return NULL;
12517 if (PyUnicode_IS_ASCII(self))
12518 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012519 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520}
12521
/* Strip modes understood by the strip helpers in this file. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, used in error messages.  Designated
   initializers tie every entry to its constant, so the table stays correct
   even if the constants above are renumbered or reordered. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

/* Map a strip mode to the name of the corresponding str method. */
#define STRIPNAME(i) (stripfuncnames[(i)])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012530
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012531/* externally visible for str.strip(unicode) */
12532PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012533_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012534{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012535 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 int kind;
12537 Py_ssize_t i, j, len;
12538 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012539 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12542 return NULL;
12543
12544 kind = PyUnicode_KIND(self);
12545 data = PyUnicode_DATA(self);
12546 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012547 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12549 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012550 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012551
Benjamin Peterson14339b62009-01-31 16:36:08 +000012552 i = 0;
12553 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012554 while (i < len) {
12555 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12556 if (!BLOOM(sepmask, ch))
12557 break;
12558 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12559 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 i++;
12561 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012562 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012563
Benjamin Peterson14339b62009-01-31 16:36:08 +000012564 j = len;
12565 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012566 j--;
12567 while (j >= i) {
12568 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12569 if (!BLOOM(sepmask, ch))
12570 break;
12571 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12572 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012574 }
12575
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012577 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012578
Victor Stinner7931d9a2011-11-04 00:22:48 +010012579 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580}
12581
12582PyObject*
12583PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12584{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012585 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012587 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588
Victor Stinnerde636f32011-10-01 03:55:54 +020012589 if (PyUnicode_READY(self) == -1)
12590 return NULL;
12591
Victor Stinner684d5fd2012-05-03 02:32:34 +020012592 length = PyUnicode_GET_LENGTH(self);
12593 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012594
Victor Stinner684d5fd2012-05-03 02:32:34 +020012595 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012596 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597
Victor Stinnerde636f32011-10-01 03:55:54 +020012598 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012599 PyErr_SetString(PyExc_IndexError, "string index out of range");
12600 return NULL;
12601 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012602 if (start >= length || end < start)
12603 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012604
Victor Stinner684d5fd2012-05-03 02:32:34 +020012605 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012606 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012607 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012608 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012609 }
12610 else {
12611 kind = PyUnicode_KIND(self);
12612 data = PyUnicode_1BYTE_DATA(self);
12613 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012614 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012615 length);
12616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618
12619static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012620do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 Py_ssize_t len, i, j;
12623
12624 if (PyUnicode_READY(self) == -1)
12625 return NULL;
12626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012628
Victor Stinnercc7af722013-04-09 22:39:24 +020012629 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012630 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012631
12632 i = 0;
12633 if (striptype != RIGHTSTRIP) {
12634 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012635 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012636 if (!_Py_ascii_whitespace[ch])
12637 break;
12638 i++;
12639 }
12640 }
12641
12642 j = len;
12643 if (striptype != LEFTSTRIP) {
12644 j--;
12645 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012646 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012647 if (!_Py_ascii_whitespace[ch])
12648 break;
12649 j--;
12650 }
12651 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012652 }
12653 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012654 else {
12655 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012656 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012657
Victor Stinnercc7af722013-04-09 22:39:24 +020012658 i = 0;
12659 if (striptype != RIGHTSTRIP) {
12660 while (i < len) {
12661 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12662 if (!Py_UNICODE_ISSPACE(ch))
12663 break;
12664 i++;
12665 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012666 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012667
12668 j = len;
12669 if (striptype != LEFTSTRIP) {
12670 j--;
12671 while (j >= i) {
12672 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12673 if (!Py_UNICODE_ISSPACE(ch))
12674 break;
12675 j--;
12676 }
12677 j++;
12678 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012679 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012680
Victor Stinner7931d9a2011-11-04 00:22:48 +010012681 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682}
12683
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012684
12685static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012686do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012687{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012688 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012689 if (PyUnicode_Check(sep))
12690 return _PyUnicode_XStrip(self, striptype, sep);
12691 else {
12692 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 "%s arg must be None or str",
12694 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012695 return NULL;
12696 }
12697 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012698
Benjamin Peterson14339b62009-01-31 16:36:08 +000012699 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012700}
12701
12702
INADA Naoki3ae20562017-01-16 20:41:20 +090012703/*[clinic input]
12704str.strip as unicode_strip
12705
12706 chars: object = None
12707 /
12708
Zachary Ware09895c22019-10-09 16:09:00 -050012709Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012710
12711If chars is given and not None, remove characters in chars instead.
12712[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012713
12714static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012715unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012716/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012717{
INADA Naoki3ae20562017-01-16 20:41:20 +090012718 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012719}
12720
12721
INADA Naoki3ae20562017-01-16 20:41:20 +090012722/*[clinic input]
12723str.lstrip as unicode_lstrip
12724
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012725 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012726 /
12727
12728Return a copy of the string with leading whitespace removed.
12729
12730If chars is given and not None, remove characters in chars instead.
12731[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012732
12733static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012734unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012735/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012736{
INADA Naoki3ae20562017-01-16 20:41:20 +090012737 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012738}
12739
12740
INADA Naoki3ae20562017-01-16 20:41:20 +090012741/*[clinic input]
12742str.rstrip as unicode_rstrip
12743
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012744 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012745 /
12746
12747Return a copy of the string with trailing whitespace removed.
12748
12749If chars is given and not None, remove characters in chars instead.
12750[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012751
12752static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012753unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012754/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012755{
INADA Naoki3ae20562017-01-16 20:41:20 +090012756 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012757}
12758
12759
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012761unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012763 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765
Serhiy Storchaka05997252013-01-26 12:14:02 +020012766 if (len < 1)
12767 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768
Victor Stinnerc4b49542011-12-11 22:44:26 +010012769 /* no repeat, return original string */
12770 if (len == 1)
12771 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012772
Benjamin Petersonbac79492012-01-14 13:34:47 -050012773 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774 return NULL;
12775
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012776 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012777 PyErr_SetString(PyExc_OverflowError,
12778 "repeated string is too long");
12779 return NULL;
12780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012782
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012783 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784 if (!u)
12785 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012786 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012789 int kind = PyUnicode_KIND(str);
12790 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012791 if (kind == PyUnicode_1BYTE_KIND) {
12792 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012793 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012794 }
12795 else if (kind == PyUnicode_2BYTE_KIND) {
12796 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012797 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012798 ucs2[n] = fill_char;
12799 } else {
12800 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12801 assert(kind == PyUnicode_4BYTE_KIND);
12802 for (n = 0; n < len; ++n)
12803 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012805 }
12806 else {
12807 /* number of characters copied this far */
12808 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012809 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012811 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012813 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012815 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012816 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012817 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818 }
12819
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012820 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012821 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822}
12823
Alexander Belopolsky40018472011-02-26 01:02:56 +000012824PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012825PyUnicode_Replace(PyObject *str,
12826 PyObject *substr,
12827 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012828 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012830 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12831 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012832 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012833 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834}
12835
INADA Naoki3ae20562017-01-16 20:41:20 +090012836/*[clinic input]
12837str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838
INADA Naoki3ae20562017-01-16 20:41:20 +090012839 old: unicode
12840 new: unicode
12841 count: Py_ssize_t = -1
12842 Maximum number of occurrences to replace.
12843 -1 (the default value) means replace all occurrences.
12844 /
12845
12846Return a copy with all occurrences of substring old replaced by new.
12847
12848If the optional argument count is given, only the first count occurrences are
12849replaced.
12850[clinic start generated code]*/
12851
12852static PyObject *
12853unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12854 Py_ssize_t count)
12855/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012857 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012858 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012859 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860}
12861
sweeneydea81849b2020-04-22 17:05:48 -040012862/*[clinic input]
12863str.removeprefix as unicode_removeprefix
12864
12865 prefix: unicode
12866 /
12867
12868Return a str with the given prefix string removed if present.
12869
12870If the string starts with the prefix string, return string[len(prefix):].
12871Otherwise, return a copy of the original string.
12872[clinic start generated code]*/
12873
12874static PyObject *
12875unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12876/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12877{
12878 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12879 if (match == -1) {
12880 return NULL;
12881 }
12882 if (match) {
12883 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12884 PyUnicode_GET_LENGTH(self));
12885 }
12886 return unicode_result_unchanged(self);
12887}
12888
12889/*[clinic input]
12890str.removesuffix as unicode_removesuffix
12891
12892 suffix: unicode
12893 /
12894
12895Return a str with the given suffix string removed if present.
12896
12897If the string ends with the suffix string and that suffix is not empty,
12898return string[:-len(suffix)]. Otherwise, return a copy of the original
12899string.
12900[clinic start generated code]*/
12901
12902static PyObject *
12903unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12904/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12905{
12906 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12907 if (match == -1) {
12908 return NULL;
12909 }
12910 if (match) {
12911 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12912 - PyUnicode_GET_LENGTH(suffix));
12913 }
12914 return unicode_result_unchanged(self);
12915}
12916
Alexander Belopolsky40018472011-02-26 01:02:56 +000012917static PyObject *
12918unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012919{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012920 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 Py_ssize_t isize;
12922 Py_ssize_t osize, squote, dquote, i, o;
12923 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012924 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012925 const void *idata;
12926 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012929 return NULL;
12930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931 isize = PyUnicode_GET_LENGTH(unicode);
12932 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 /* Compute length of output, quote characters, and
12935 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012936 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 max = 127;
12938 squote = dquote = 0;
12939 ikind = PyUnicode_KIND(unicode);
12940 for (i = 0; i < isize; i++) {
12941 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012942 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012943 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012944 case '\'': squote++; break;
12945 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012947 incr = 2;
12948 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 default:
12950 /* Fast-path ASCII */
12951 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012952 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012954 ;
12955 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012958 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012960 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012962 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012963 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012964 if (osize > PY_SSIZE_T_MAX - incr) {
12965 PyErr_SetString(PyExc_OverflowError,
12966 "string is too long to generate repr");
12967 return NULL;
12968 }
12969 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 }
12971
12972 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012973 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012975 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 if (dquote)
12977 /* Both squote and dquote present. Use squote,
12978 and escape them */
12979 osize += squote;
12980 else
12981 quote = '"';
12982 }
Victor Stinner55c08782013-04-14 18:45:39 +020012983 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984
12985 repr = PyUnicode_New(osize, max);
12986 if (repr == NULL)
12987 return NULL;
12988 okind = PyUnicode_KIND(repr);
12989 odata = PyUnicode_DATA(repr);
12990
12991 PyUnicode_WRITE(okind, odata, 0, quote);
12992 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012993 if (unchanged) {
12994 _PyUnicode_FastCopyCharacters(repr, 1,
12995 unicode, 0,
12996 isize);
12997 }
12998 else {
12999 for (i = 0, o = 1; i < isize; i++) {
13000 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001
Victor Stinner55c08782013-04-14 18:45:39 +020013002 /* Escape quotes and backslashes */
13003 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013004 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013006 continue;
13007 }
13008
13009 /* Map special whitespace to '\t', \n', '\r' */
13010 if (ch == '\t') {
13011 PyUnicode_WRITE(okind, odata, o++, '\\');
13012 PyUnicode_WRITE(okind, odata, o++, 't');
13013 }
13014 else if (ch == '\n') {
13015 PyUnicode_WRITE(okind, odata, o++, '\\');
13016 PyUnicode_WRITE(okind, odata, o++, 'n');
13017 }
13018 else if (ch == '\r') {
13019 PyUnicode_WRITE(okind, odata, o++, '\\');
13020 PyUnicode_WRITE(okind, odata, o++, 'r');
13021 }
13022
13023 /* Map non-printable US ASCII to '\xhh' */
13024 else if (ch < ' ' || ch == 0x7F) {
13025 PyUnicode_WRITE(okind, odata, o++, '\\');
13026 PyUnicode_WRITE(okind, odata, o++, 'x');
13027 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13028 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13029 }
13030
13031 /* Copy ASCII characters as-is */
13032 else if (ch < 0x7F) {
13033 PyUnicode_WRITE(okind, odata, o++, ch);
13034 }
13035
13036 /* Non-ASCII characters */
13037 else {
13038 /* Map Unicode whitespace and control characters
13039 (categories Z* and C* except ASCII space)
13040 */
13041 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13042 PyUnicode_WRITE(okind, odata, o++, '\\');
13043 /* Map 8-bit characters to '\xhh' */
13044 if (ch <= 0xff) {
13045 PyUnicode_WRITE(okind, odata, o++, 'x');
13046 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13047 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13048 }
13049 /* Map 16-bit characters to '\uxxxx' */
13050 else if (ch <= 0xffff) {
13051 PyUnicode_WRITE(okind, odata, o++, 'u');
13052 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13053 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13056 }
13057 /* Map 21-bit characters to '\U00xxxxxx' */
13058 else {
13059 PyUnicode_WRITE(okind, odata, o++, 'U');
13060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13064 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13065 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13066 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13067 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13068 }
13069 }
13070 /* Copy characters as-is */
13071 else {
13072 PyUnicode_WRITE(okind, odata, o++, ch);
13073 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013074 }
13075 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013076 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013078 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013079 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080}
13081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013082PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013083 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084\n\
13085Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013086such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087arguments start and end are interpreted as in slice notation.\n\
13088\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013089Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090
13091static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013094 /* initialize variables to prevent gcc warning */
13095 PyObject *substring = NULL;
13096 Py_ssize_t start = 0;
13097 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013098 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013100 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013103 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013106 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108 if (result == -2)
13109 return NULL;
13110
Christian Heimes217cfd12007-12-02 14:31:20 +000013111 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112}
13113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013114PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013115 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013117Return the highest index in S where substring sub is found,\n\
13118such that sub is contained within S[start:end]. Optional\n\
13119arguments start and end are interpreted as in slice notation.\n\
13120\n\
13121Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122
13123static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013126 /* initialize variables to prevent gcc warning */
13127 PyObject *substring = NULL;
13128 Py_ssize_t start = 0;
13129 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013130 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013132 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013133 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013135 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013138 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 if (result == -2)
13141 return NULL;
13142
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143 if (result < 0) {
13144 PyErr_SetString(PyExc_ValueError, "substring not found");
13145 return NULL;
13146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147
Christian Heimes217cfd12007-12-02 14:31:20 +000013148 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149}
13150
INADA Naoki3ae20562017-01-16 20:41:20 +090013151/*[clinic input]
13152str.rjust as unicode_rjust
13153
13154 width: Py_ssize_t
13155 fillchar: Py_UCS4 = ' '
13156 /
13157
13158Return a right-justified string of length width.
13159
13160Padding is done using the specified fill character (default is a space).
13161[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162
13163static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013164unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13165/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013166{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013167 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168 return NULL;
13169
Victor Stinnerc4b49542011-12-11 22:44:26 +010013170 if (PyUnicode_GET_LENGTH(self) >= width)
13171 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172
Victor Stinnerc4b49542011-12-11 22:44:26 +010013173 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174}
13175
Alexander Belopolsky40018472011-02-26 01:02:56 +000013176PyObject *
13177PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013178{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013179 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013180 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013182 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183}
13184
INADA Naoki3ae20562017-01-16 20:41:20 +090013185/*[clinic input]
13186str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187
INADA Naoki3ae20562017-01-16 20:41:20 +090013188 sep: object = None
13189 The delimiter according which to split the string.
13190 None (the default value) means split according to any whitespace,
13191 and discard empty strings from the result.
13192 maxsplit: Py_ssize_t = -1
13193 Maximum number of splits to do.
13194 -1 (the default value) means no limit.
13195
13196Return a list of the words in the string, using sep as the delimiter string.
13197[clinic start generated code]*/
13198
13199static PyObject *
13200unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13201/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202{
INADA Naoki3ae20562017-01-16 20:41:20 +090013203 if (sep == Py_None)
13204 return split(self, NULL, maxsplit);
13205 if (PyUnicode_Check(sep))
13206 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013207
Victor Stinner998b8062018-09-12 00:23:25 +020013208 PyErr_Format(PyExc_TypeError,
13209 "must be str or None, not %.100s",
13210 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212}
13213
Thomas Wouters477c8d52006-05-27 19:21:47 +000013214PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013215PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013216{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013217 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013218 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013219 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013221
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013222 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013223 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013224
Victor Stinner14f8f022011-10-05 20:58:25 +020013225 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013226 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 len1 = PyUnicode_GET_LENGTH(str_obj);
13228 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013229 if (kind1 < kind2 || len1 < len2) {
13230 _Py_INCREF_UNICODE_EMPTY();
13231 if (!unicode_empty)
13232 out = NULL;
13233 else {
13234 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13235 Py_DECREF(unicode_empty);
13236 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013237 return out;
13238 }
13239 buf1 = PyUnicode_DATA(str_obj);
13240 buf2 = PyUnicode_DATA(sep_obj);
13241 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013242 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013243 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013244 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013246
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013247 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013249 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13250 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13251 else
13252 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013253 break;
13254 case PyUnicode_2BYTE_KIND:
13255 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13256 break;
13257 case PyUnicode_4BYTE_KIND:
13258 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13259 break;
13260 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013261 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013262 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013263
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013264 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013265 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013266 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013267
13268 return out;
13269}
13270
13271
13272PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013273PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013274{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013275 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013276 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013277 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013278 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013279
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013280 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013281 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013282
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013283 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013284 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013285 len1 = PyUnicode_GET_LENGTH(str_obj);
13286 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013287 if (kind1 < kind2 || len1 < len2) {
13288 _Py_INCREF_UNICODE_EMPTY();
13289 if (!unicode_empty)
13290 out = NULL;
13291 else {
13292 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13293 Py_DECREF(unicode_empty);
13294 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013295 return out;
13296 }
13297 buf1 = PyUnicode_DATA(str_obj);
13298 buf2 = PyUnicode_DATA(sep_obj);
13299 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013300 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013301 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013302 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013304
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013305 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013307 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13308 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13309 else
13310 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311 break;
13312 case PyUnicode_2BYTE_KIND:
13313 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13314 break;
13315 case PyUnicode_4BYTE_KIND:
13316 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13317 break;
13318 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013319 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013321
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013322 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013323 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013324 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013325
13326 return out;
13327}
13328
INADA Naoki3ae20562017-01-16 20:41:20 +090013329/*[clinic input]
13330str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013331
INADA Naoki3ae20562017-01-16 20:41:20 +090013332 sep: object
13333 /
13334
13335Partition the string into three parts using the given separator.
13336
13337This will search for the separator in the string. If the separator is found,
13338returns a 3-tuple containing the part before the separator, the separator
13339itself, and the part after it.
13340
13341If the separator is not found, returns a 3-tuple containing the original string
13342and two empty strings.
13343[clinic start generated code]*/
13344
13345static PyObject *
13346unicode_partition(PyObject *self, PyObject *sep)
13347/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013348{
INADA Naoki3ae20562017-01-16 20:41:20 +090013349 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013350}
13351
INADA Naoki3ae20562017-01-16 20:41:20 +090013352/*[clinic input]
13353str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013354
INADA Naoki3ae20562017-01-16 20:41:20 +090013355Partition the string into three parts using the given separator.
13356
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013357This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013358the separator is found, returns a 3-tuple containing the part before the
13359separator, the separator itself, and the part after it.
13360
13361If the separator is not found, returns a 3-tuple containing two empty strings
13362and the original string.
13363[clinic start generated code]*/
13364
13365static PyObject *
13366unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013367/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013368{
INADA Naoki3ae20562017-01-16 20:41:20 +090013369 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013370}
13371
Alexander Belopolsky40018472011-02-26 01:02:56 +000013372PyObject *
13373PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013374{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013375 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013376 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013377
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013378 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013379}
13380
INADA Naoki3ae20562017-01-16 20:41:20 +090013381/*[clinic input]
13382str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013383
INADA Naoki3ae20562017-01-16 20:41:20 +090013384Return a list of the words in the string, using sep as the delimiter string.
13385
13386Splits are done starting at the end of the string and working to the front.
13387[clinic start generated code]*/
13388
13389static PyObject *
13390unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13391/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013392{
INADA Naoki3ae20562017-01-16 20:41:20 +090013393 if (sep == Py_None)
13394 return rsplit(self, NULL, maxsplit);
13395 if (PyUnicode_Check(sep))
13396 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013397
Victor Stinner998b8062018-09-12 00:23:25 +020013398 PyErr_Format(PyExc_TypeError,
13399 "must be str or None, not %.100s",
13400 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013401 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013402}
13403
INADA Naoki3ae20562017-01-16 20:41:20 +090013404/*[clinic input]
13405str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013407 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013408
13409Return a list of the lines in the string, breaking at line boundaries.
13410
13411Line breaks are not included in the resulting list unless keepends is given and
13412true.
13413[clinic start generated code]*/
13414
13415static PyObject *
13416unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013417/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013418{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013419 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420}
13421
13422static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013423PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013425 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426}
13427
INADA Naoki3ae20562017-01-16 20:41:20 +090013428/*[clinic input]
13429str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013430
INADA Naoki3ae20562017-01-16 20:41:20 +090013431Convert uppercase characters to lowercase and lowercase characters to uppercase.
13432[clinic start generated code]*/
13433
13434static PyObject *
13435unicode_swapcase_impl(PyObject *self)
13436/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013438 if (PyUnicode_READY(self) == -1)
13439 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013440 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441}
13442
Larry Hastings61272b72014-01-07 12:41:53 -080013443/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013444
Larry Hastings31826802013-10-19 00:09:25 -070013445@staticmethod
13446str.maketrans as unicode_maketrans
13447
13448 x: object
13449
13450 y: unicode=NULL
13451
13452 z: unicode=NULL
13453
13454 /
13455
13456Return a translation table usable for str.translate().
13457
13458If there is only one argument, it must be a dictionary mapping Unicode
13459ordinals (integers) or characters to Unicode ordinals, strings or None.
13460Character keys will be then converted to ordinals.
13461If there are two arguments, they must be strings of equal length, and
13462in the resulting dictionary, each character in x will be mapped to the
13463character at the same position in y. If there is a third argument, it
13464must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013465[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013466
Larry Hastings31826802013-10-19 00:09:25 -070013467static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013468unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013469/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013470{
Georg Brandlceee0772007-11-27 23:48:05 +000013471 PyObject *new = NULL, *key, *value;
13472 Py_ssize_t i = 0;
13473 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013474
Georg Brandlceee0772007-11-27 23:48:05 +000013475 new = PyDict_New();
13476 if (!new)
13477 return NULL;
13478 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013479 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013480 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013481
Georg Brandlceee0772007-11-27 23:48:05 +000013482 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013483 if (!PyUnicode_Check(x)) {
13484 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13485 "be a string if there is a second argument");
13486 goto err;
13487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013488 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013489 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13490 "arguments must have equal length");
13491 goto err;
13492 }
13493 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013494 x_kind = PyUnicode_KIND(x);
13495 y_kind = PyUnicode_KIND(y);
13496 x_data = PyUnicode_DATA(x);
13497 y_data = PyUnicode_DATA(y);
13498 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13499 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013500 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013501 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013502 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013503 if (!value) {
13504 Py_DECREF(key);
13505 goto err;
13506 }
Georg Brandlceee0772007-11-27 23:48:05 +000013507 res = PyDict_SetItem(new, key, value);
13508 Py_DECREF(key);
13509 Py_DECREF(value);
13510 if (res < 0)
13511 goto err;
13512 }
13513 /* create entries for deleting chars in z */
13514 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013515 z_kind = PyUnicode_KIND(z);
13516 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013517 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013518 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013519 if (!key)
13520 goto err;
13521 res = PyDict_SetItem(new, key, Py_None);
13522 Py_DECREF(key);
13523 if (res < 0)
13524 goto err;
13525 }
13526 }
13527 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013528 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013529 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013530
Georg Brandlceee0772007-11-27 23:48:05 +000013531 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013532 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013533 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13534 "to maketrans it must be a dict");
13535 goto err;
13536 }
13537 /* copy entries into the new dict, converting string keys to int keys */
13538 while (PyDict_Next(x, &i, &key, &value)) {
13539 if (PyUnicode_Check(key)) {
13540 /* convert string keys to integer keys */
13541 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013542 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013543 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13544 "table must be of length 1");
13545 goto err;
13546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013547 kind = PyUnicode_KIND(key);
13548 data = PyUnicode_DATA(key);
13549 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013550 if (!newkey)
13551 goto err;
13552 res = PyDict_SetItem(new, newkey, value);
13553 Py_DECREF(newkey);
13554 if (res < 0)
13555 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013556 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013557 /* just keep integer keys */
13558 if (PyDict_SetItem(new, key, value) < 0)
13559 goto err;
13560 } else {
13561 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13562 "be strings or integers");
13563 goto err;
13564 }
13565 }
13566 }
13567 return new;
13568 err:
13569 Py_DECREF(new);
13570 return NULL;
13571}
13572
INADA Naoki3ae20562017-01-16 20:41:20 +090013573/*[clinic input]
13574str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013575
INADA Naoki3ae20562017-01-16 20:41:20 +090013576 table: object
13577 Translation table, which must be a mapping of Unicode ordinals to
13578 Unicode ordinals, strings, or None.
13579 /
13580
13581Replace each character in the string using the given translation table.
13582
13583The table must implement lookup/indexing via __getitem__, for instance a
13584dictionary or list. If this operation raises LookupError, the character is
13585left untouched. Characters mapped to None are deleted.
13586[clinic start generated code]*/
13587
13588static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013589unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013590/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013592 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013593}
13594
INADA Naoki3ae20562017-01-16 20:41:20 +090013595/*[clinic input]
13596str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013597
INADA Naoki3ae20562017-01-16 20:41:20 +090013598Return a copy of the string converted to uppercase.
13599[clinic start generated code]*/
13600
13601static PyObject *
13602unicode_upper_impl(PyObject *self)
13603/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013604{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013605 if (PyUnicode_READY(self) == -1)
13606 return NULL;
13607 if (PyUnicode_IS_ASCII(self))
13608 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013609 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013610}
13611
INADA Naoki3ae20562017-01-16 20:41:20 +090013612/*[clinic input]
13613str.zfill as unicode_zfill
13614
13615 width: Py_ssize_t
13616 /
13617
13618Pad a numeric string with zeros on the left, to fill a field of the given width.
13619
13620The string is never truncated.
13621[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013622
13623static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013624unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013625/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013626{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013627 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013628 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013629 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013630 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 Py_UCS4 chr;
13632
Benjamin Petersonbac79492012-01-14 13:34:47 -050013633 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013635
Victor Stinnerc4b49542011-12-11 22:44:26 +010013636 if (PyUnicode_GET_LENGTH(self) >= width)
13637 return unicode_result_unchanged(self);
13638
13639 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013640
13641 u = pad(self, fill, 0, '0');
13642
Walter Dörwald068325e2002-04-15 13:36:47 +000013643 if (u == NULL)
13644 return NULL;
13645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013646 kind = PyUnicode_KIND(u);
13647 data = PyUnicode_DATA(u);
13648 chr = PyUnicode_READ(kind, data, fill);
13649
13650 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013651 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013652 PyUnicode_WRITE(kind, data, 0, chr);
13653 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013654 }
13655
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013656 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013657 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013658}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013659
#if 0
/* Compiled out by the surrounding `#if 0`.  Would simply return the result
   of PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013668PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013669 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013670\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013671Return True if S starts with the specified prefix, False otherwise.\n\
13672With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013673With optional end, stop comparing S at that position.\n\
13674prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013675
13676static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013677unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013678 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013680 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013681 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013682 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013683 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013684 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013685
Jesus Ceaac451502011-04-20 17:09:23 +020013686 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013687 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013688 if (PyTuple_Check(subobj)) {
13689 Py_ssize_t i;
13690 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013691 substring = PyTuple_GET_ITEM(subobj, i);
13692 if (!PyUnicode_Check(substring)) {
13693 PyErr_Format(PyExc_TypeError,
13694 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013695 "not %.100s",
13696 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013697 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013698 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013699 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013700 if (result == -1)
13701 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013702 if (result) {
13703 Py_RETURN_TRUE;
13704 }
13705 }
13706 /* nothing matched */
13707 Py_RETURN_FALSE;
13708 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013709 if (!PyUnicode_Check(subobj)) {
13710 PyErr_Format(PyExc_TypeError,
13711 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013712 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013714 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013715 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013716 if (result == -1)
13717 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013718 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013719}
13720
13721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013722PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013723 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013724\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013725Return True if S ends with the specified suffix, False otherwise.\n\
13726With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013727With optional end, stop comparing S at that position.\n\
13728suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013729
13730static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013731unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013732 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013733{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013734 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013735 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013736 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013737 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013738 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013739
Jesus Ceaac451502011-04-20 17:09:23 +020013740 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013741 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013742 if (PyTuple_Check(subobj)) {
13743 Py_ssize_t i;
13744 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013745 substring = PyTuple_GET_ITEM(subobj, i);
13746 if (!PyUnicode_Check(substring)) {
13747 PyErr_Format(PyExc_TypeError,
13748 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013749 "not %.100s",
13750 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013751 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013752 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013753 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013754 if (result == -1)
13755 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013756 if (result) {
13757 Py_RETURN_TRUE;
13758 }
13759 }
13760 Py_RETURN_FALSE;
13761 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013762 if (!PyUnicode_Check(subobj)) {
13763 PyErr_Format(PyExc_TypeError,
13764 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013765 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013766 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013767 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013768 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013769 if (result == -1)
13770 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013771 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013772}
13773
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013774static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013775_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013776{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013777 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13778 writer->data = PyUnicode_DATA(writer->buffer);
13779
13780 if (!writer->readonly) {
13781 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013782 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013783 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013784 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013785 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13786 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13787 writer->kind = PyUnicode_WCHAR_KIND;
13788 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13789
Victor Stinner8f674cc2013-04-17 23:02:17 +020013790 /* Copy-on-write mode: set buffer size to 0 so
13791 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13792 * next write. */
13793 writer->size = 0;
13794 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013795}
13796
Victor Stinnerd3f08822012-05-29 12:57:52 +020013797void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013798_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013799{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013800 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013801
13802 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013803 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013804
13805 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13806 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13807 writer->kind = PyUnicode_WCHAR_KIND;
13808 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013809}
13810
Inada Naoki770847a2019-06-24 12:30:24 +090013811// Initialize _PyUnicodeWriter with initial buffer
13812static inline void
13813_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13814{
13815 memset(writer, 0, sizeof(*writer));
13816 writer->buffer = buffer;
13817 _PyUnicodeWriter_Update(writer);
13818 writer->min_length = writer->size;
13819}
13820
Victor Stinnerd3f08822012-05-29 12:57:52 +020013821int
13822_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13823 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013824{
13825 Py_ssize_t newlen;
13826 PyObject *newbuffer;
13827
Victor Stinner2740e462016-09-06 16:58:36 -070013828 assert(maxchar <= MAX_UNICODE);
13829
Victor Stinnerca9381e2015-09-22 00:58:32 +020013830 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013831 assert((maxchar > writer->maxchar && length >= 0)
13832 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013833
Victor Stinner202fdca2012-05-07 12:47:02 +020013834 if (length > PY_SSIZE_T_MAX - writer->pos) {
13835 PyErr_NoMemory();
13836 return -1;
13837 }
13838 newlen = writer->pos + length;
13839
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013840 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013841
Victor Stinnerd3f08822012-05-29 12:57:52 +020013842 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013843 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013844 if (writer->overallocate
13845 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13846 /* overallocate to limit the number of realloc() */
13847 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013848 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013849 if (newlen < writer->min_length)
13850 newlen = writer->min_length;
13851
Victor Stinnerd3f08822012-05-29 12:57:52 +020013852 writer->buffer = PyUnicode_New(newlen, maxchar);
13853 if (writer->buffer == NULL)
13854 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013855 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013856 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013857 if (writer->overallocate
13858 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13859 /* overallocate to limit the number of realloc() */
13860 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013861 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013862 if (newlen < writer->min_length)
13863 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013864
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013865 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013866 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013867 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013868 newbuffer = PyUnicode_New(newlen, maxchar);
13869 if (newbuffer == NULL)
13870 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013871 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13872 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013873 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013874 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013875 }
13876 else {
13877 newbuffer = resize_compact(writer->buffer, newlen);
13878 if (newbuffer == NULL)
13879 return -1;
13880 }
13881 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013882 }
13883 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013884 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013885 newbuffer = PyUnicode_New(writer->size, maxchar);
13886 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013887 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013888 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13889 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013890 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013891 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013892 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013893 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013894
13895#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013896}
13897
Victor Stinnerca9381e2015-09-22 00:58:32 +020013898int
13899_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13900 enum PyUnicode_Kind kind)
13901{
13902 Py_UCS4 maxchar;
13903
13904 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13905 assert(writer->kind < kind);
13906
13907 switch (kind)
13908 {
13909 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13910 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13911 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13912 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013913 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013914 }
13915
13916 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13917}
13918
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013919static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013920_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013921{
Victor Stinner2740e462016-09-06 16:58:36 -070013922 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013923 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13924 return -1;
13925 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13926 writer->pos++;
13927 return 0;
13928}
13929
13930int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013931_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13932{
13933 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13934}
13935
13936int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013937_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13938{
13939 Py_UCS4 maxchar;
13940 Py_ssize_t len;
13941
13942 if (PyUnicode_READY(str) == -1)
13943 return -1;
13944 len = PyUnicode_GET_LENGTH(str);
13945 if (len == 0)
13946 return 0;
13947 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13948 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013949 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013950 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013951 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013952 Py_INCREF(str);
13953 writer->buffer = str;
13954 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013955 writer->pos += len;
13956 return 0;
13957 }
13958 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13959 return -1;
13960 }
13961 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13962 str, 0, len);
13963 writer->pos += len;
13964 return 0;
13965}
13966
Victor Stinnere215d962012-10-06 23:03:36 +020013967int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013968_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13969 Py_ssize_t start, Py_ssize_t end)
13970{
13971 Py_UCS4 maxchar;
13972 Py_ssize_t len;
13973
13974 if (PyUnicode_READY(str) == -1)
13975 return -1;
13976
13977 assert(0 <= start);
13978 assert(end <= PyUnicode_GET_LENGTH(str));
13979 assert(start <= end);
13980
13981 if (end == 0)
13982 return 0;
13983
13984 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13985 return _PyUnicodeWriter_WriteStr(writer, str);
13986
13987 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13988 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13989 else
13990 maxchar = writer->maxchar;
13991 len = end - start;
13992
13993 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13994 return -1;
13995
13996 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13997 str, start, len);
13998 writer->pos += len;
13999 return 0;
14000}
14001
14002int
Victor Stinner4a587072013-11-19 12:54:53 +010014003_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14004 const char *ascii, Py_ssize_t len)
14005{
14006 if (len == -1)
14007 len = strlen(ascii);
14008
Andy Lestere6be9b52020-02-11 20:28:35 -060014009 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014010
14011 if (writer->buffer == NULL && !writer->overallocate) {
14012 PyObject *str;
14013
14014 str = _PyUnicode_FromASCII(ascii, len);
14015 if (str == NULL)
14016 return -1;
14017
14018 writer->readonly = 1;
14019 writer->buffer = str;
14020 _PyUnicodeWriter_Update(writer);
14021 writer->pos += len;
14022 return 0;
14023 }
14024
14025 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14026 return -1;
14027
14028 switch (writer->kind)
14029 {
14030 case PyUnicode_1BYTE_KIND:
14031 {
14032 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14033 Py_UCS1 *data = writer->data;
14034
Christian Heimesf051e432016-09-13 20:22:02 +020014035 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014036 break;
14037 }
14038 case PyUnicode_2BYTE_KIND:
14039 {
14040 _PyUnicode_CONVERT_BYTES(
14041 Py_UCS1, Py_UCS2,
14042 ascii, ascii + len,
14043 (Py_UCS2 *)writer->data + writer->pos);
14044 break;
14045 }
14046 case PyUnicode_4BYTE_KIND:
14047 {
14048 _PyUnicode_CONVERT_BYTES(
14049 Py_UCS1, Py_UCS4,
14050 ascii, ascii + len,
14051 (Py_UCS4 *)writer->data + writer->pos);
14052 break;
14053 }
14054 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014055 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014056 }
14057
14058 writer->pos += len;
14059 return 0;
14060}
14061
14062int
14063_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14064 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014065{
14066 Py_UCS4 maxchar;
14067
Andy Lestere6be9b52020-02-11 20:28:35 -060014068 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014069 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14070 return -1;
14071 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14072 writer->pos += len;
14073 return 0;
14074}
14075
Victor Stinnerd3f08822012-05-29 12:57:52 +020014076PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014077_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014078{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014079 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014080
Victor Stinnerd3f08822012-05-29 12:57:52 +020014081 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014082 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014083 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014084 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014085
14086 str = writer->buffer;
14087 writer->buffer = NULL;
14088
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014089 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014090 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14091 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014092 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014093
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014094 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14095 PyObject *str2;
14096 str2 = resize_compact(str, writer->pos);
14097 if (str2 == NULL) {
14098 Py_DECREF(str);
14099 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014100 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014101 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014102 }
14103
Victor Stinner15a0bd32013-07-08 22:29:55 +020014104 assert(_PyUnicode_CheckConsistency(str, 1));
14105 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014106}
14107
Victor Stinnerd3f08822012-05-29 12:57:52 +020014108void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014109_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014110{
14111 Py_CLEAR(writer->buffer);
14112}
14113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014114#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014115
14116PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014117 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014118\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014119Return a formatted version of S, using substitutions from args and kwargs.\n\
14120The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014121
Eric Smith27bbca62010-11-04 17:06:58 +000014122PyDoc_STRVAR(format_map__doc__,
14123 "S.format_map(mapping) -> str\n\
14124\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014125Return a formatted version of S, using substitutions from mapping.\n\
14126The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014127
INADA Naoki3ae20562017-01-16 20:41:20 +090014128/*[clinic input]
14129str.__format__ as unicode___format__
14130
14131 format_spec: unicode
14132 /
14133
14134Return a formatted version of the string as described by format_spec.
14135[clinic start generated code]*/
14136
Eric Smith4a7d76d2008-05-30 18:10:19 +000014137static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014138unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014139/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014140{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014141 _PyUnicodeWriter writer;
14142 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014143
Victor Stinnerd3f08822012-05-29 12:57:52 +020014144 if (PyUnicode_READY(self) == -1)
14145 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014146 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014147 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14148 self, format_spec, 0,
14149 PyUnicode_GET_LENGTH(format_spec));
14150 if (ret == -1) {
14151 _PyUnicodeWriter_Dealloc(&writer);
14152 return NULL;
14153 }
14154 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014155}
14156
INADA Naoki3ae20562017-01-16 20:41:20 +090014157/*[clinic input]
14158str.__sizeof__ as unicode_sizeof
14159
14160Return the size of the string in memory, in bytes.
14161[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014162
14163static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014164unicode_sizeof_impl(PyObject *self)
14165/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014167 Py_ssize_t size;
14168
14169 /* If it's a compact object, account for base structure +
14170 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014171 if (PyUnicode_IS_COMPACT_ASCII(self))
14172 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14173 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014174 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014175 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014176 else {
14177 /* If it is a two-block object, account for base object, and
14178 for character block if present. */
14179 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014180 if (_PyUnicode_DATA_ANY(self))
14181 size += (PyUnicode_GET_LENGTH(self) + 1) *
14182 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014183 }
14184 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014185 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014186 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14187 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14188 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14189 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014190
14191 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014192}
14193
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014194static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014195unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014196{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014197 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014198 if (!copy)
14199 return NULL;
14200 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014201}
14202
Guido van Rossumd57fd912000-03-10 22:53:23 +000014203static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014204 UNICODE_ENCODE_METHODDEF
14205 UNICODE_REPLACE_METHODDEF
14206 UNICODE_SPLIT_METHODDEF
14207 UNICODE_RSPLIT_METHODDEF
14208 UNICODE_JOIN_METHODDEF
14209 UNICODE_CAPITALIZE_METHODDEF
14210 UNICODE_CASEFOLD_METHODDEF
14211 UNICODE_TITLE_METHODDEF
14212 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014213 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014214 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014215 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014216 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014217 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014218 UNICODE_LJUST_METHODDEF
14219 UNICODE_LOWER_METHODDEF
14220 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014221 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14222 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014223 UNICODE_RJUST_METHODDEF
14224 UNICODE_RSTRIP_METHODDEF
14225 UNICODE_RPARTITION_METHODDEF
14226 UNICODE_SPLITLINES_METHODDEF
14227 UNICODE_STRIP_METHODDEF
14228 UNICODE_SWAPCASE_METHODDEF
14229 UNICODE_TRANSLATE_METHODDEF
14230 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014231 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14232 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014233 UNICODE_REMOVEPREFIX_METHODDEF
14234 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014235 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014236 UNICODE_ISLOWER_METHODDEF
14237 UNICODE_ISUPPER_METHODDEF
14238 UNICODE_ISTITLE_METHODDEF
14239 UNICODE_ISSPACE_METHODDEF
14240 UNICODE_ISDECIMAL_METHODDEF
14241 UNICODE_ISDIGIT_METHODDEF
14242 UNICODE_ISNUMERIC_METHODDEF
14243 UNICODE_ISALPHA_METHODDEF
14244 UNICODE_ISALNUM_METHODDEF
14245 UNICODE_ISIDENTIFIER_METHODDEF
14246 UNICODE_ISPRINTABLE_METHODDEF
14247 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014248 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014249 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014250 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014251 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014252 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014253#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014254 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014255 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014256#endif
14257
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014258 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014259 {NULL, NULL}
14260};
14261
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014262static PyObject *
14263unicode_mod(PyObject *v, PyObject *w)
14264{
Brian Curtindfc80e32011-08-10 20:28:54 -050014265 if (!PyUnicode_Check(v))
14266 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014267 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014268}
14269
14270static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 0, /*nb_add*/
14272 0, /*nb_subtract*/
14273 0, /*nb_multiply*/
14274 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014275};
14276
Guido van Rossumd57fd912000-03-10 22:53:23 +000014277static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014278 (lenfunc) unicode_length, /* sq_length */
14279 PyUnicode_Concat, /* sq_concat */
14280 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14281 (ssizeargfunc) unicode_getitem, /* sq_item */
14282 0, /* sq_slice */
14283 0, /* sq_ass_item */
14284 0, /* sq_ass_slice */
14285 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014286};
14287
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014288static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014289unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014291 if (PyUnicode_READY(self) == -1)
14292 return NULL;
14293
Victor Stinnera15e2602020-04-08 02:01:56 +020014294 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014295 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014296 if (i == -1 && PyErr_Occurred())
14297 return NULL;
14298 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014299 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014300 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014301 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014302 Py_ssize_t start, stop, step, slicelength, i;
14303 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014304 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014305 const void *src_data;
14306 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014307 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014308 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014309
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014310 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014311 return NULL;
14312 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014313 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14314 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014315
14316 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014317 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014318 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014319 slicelength == PyUnicode_GET_LENGTH(self)) {
14320 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014321 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014322 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014323 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014324 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014325 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014326 src_kind = PyUnicode_KIND(self);
14327 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014328 if (!PyUnicode_IS_ASCII(self)) {
14329 kind_limit = kind_maxchar_limit(src_kind);
14330 max_char = 0;
14331 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14332 ch = PyUnicode_READ(src_kind, src_data, cur);
14333 if (ch > max_char) {
14334 max_char = ch;
14335 if (max_char >= kind_limit)
14336 break;
14337 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014338 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014339 }
Victor Stinner55c99112011-10-13 01:17:06 +020014340 else
14341 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014342 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014343 if (result == NULL)
14344 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014345 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014346 dest_data = PyUnicode_DATA(result);
14347
14348 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014349 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14350 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014351 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014352 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014353 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014354 } else {
14355 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14356 return NULL;
14357 }
14358}
14359
14360static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014361 (lenfunc)unicode_length, /* mp_length */
14362 (binaryfunc)unicode_subscript, /* mp_subscript */
14363 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014364};
14365
Guido van Rossumd57fd912000-03-10 22:53:23 +000014366
Guido van Rossumd57fd912000-03-10 22:53:23 +000014367/* Helpers for PyUnicode_Format() */
14368
Victor Stinnera47082312012-10-04 02:19:54 +020014369struct unicode_formatter_t {
14370 PyObject *args;
14371 int args_owned;
14372 Py_ssize_t arglen, argidx;
14373 PyObject *dict;
14374
14375 enum PyUnicode_Kind fmtkind;
14376 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014377 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014378 PyObject *fmtstr;
14379
14380 _PyUnicodeWriter writer;
14381};
14382
14383struct unicode_format_arg_t {
14384 Py_UCS4 ch;
14385 int flags;
14386 Py_ssize_t width;
14387 int prec;
14388 int sign;
14389};
14390
Guido van Rossumd57fd912000-03-10 22:53:23 +000014391static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014392unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014393{
Victor Stinnera47082312012-10-04 02:19:54 +020014394 Py_ssize_t argidx = ctx->argidx;
14395
14396 if (argidx < ctx->arglen) {
14397 ctx->argidx++;
14398 if (ctx->arglen < 0)
14399 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014400 else
Victor Stinnera47082312012-10-04 02:19:54 +020014401 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014402 }
14403 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014404 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014405 return NULL;
14406}
14407
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014408/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014409
Victor Stinnera47082312012-10-04 02:19:54 +020014410/* Format a float into the writer if the writer is not NULL, or into *p_output
14411 otherwise.
14412
14413 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014414static int
Victor Stinnera47082312012-10-04 02:19:54 +020014415formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14416 PyObject **p_output,
14417 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014418{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014419 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014420 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014421 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014422 int prec;
14423 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014424
Guido van Rossumd57fd912000-03-10 22:53:23 +000014425 x = PyFloat_AsDouble(v);
14426 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014427 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014428
Victor Stinnera47082312012-10-04 02:19:54 +020014429 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014430 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014431 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014432
Victor Stinnera47082312012-10-04 02:19:54 +020014433 if (arg->flags & F_ALT)
14434 dtoa_flags = Py_DTSF_ALT;
14435 else
14436 dtoa_flags = 0;
14437 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014438 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014439 return -1;
14440 len = strlen(p);
14441 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014442 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014443 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014444 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014445 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014446 }
14447 else
14448 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014449 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014450 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014451}
14452
Victor Stinnerd0880d52012-04-27 23:40:13 +020014453/* formatlong() emulates the format codes d, u, o, x and X, and
14454 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14455 * Python's regular ints.
14456 * Return value: a new PyUnicodeObject*, or NULL if error.
14457 * The output string is of the form
14458 * "-"? ("0x" | "0X")? digit+
14459 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14460 * set in flags. The case of hex digits will be correct,
14461 * There will be at least prec digits, zero-filled on the left if
14462 * necessary to get that many.
14463 * val object to be converted
14464 * flags bitmask of format flags; only F_ALT is looked at
14465 * prec minimum number of digits; 0-fill on left if needed
14466 * type a character in [duoxX]; u acts the same as d
14467 *
14468 * CAUTION: o, x and X conversions on regular ints can never
14469 * produce a '-' sign, but can for Python's unbounded ints.
14470 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014471PyObject *
14472_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014473{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014474 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014475 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014476 Py_ssize_t i;
14477 int sign; /* 1 if '-', else 0 */
14478 int len; /* number of characters */
14479 Py_ssize_t llen;
14480 int numdigits; /* len == numnondigits + numdigits */
14481 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014482
Victor Stinnerd0880d52012-04-27 23:40:13 +020014483 /* Avoid exceeding SSIZE_T_MAX */
14484 if (prec > INT_MAX-3) {
14485 PyErr_SetString(PyExc_OverflowError,
14486 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014487 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014488 }
14489
14490 assert(PyLong_Check(val));
14491
14492 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014493 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014494 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014495 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014496 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014497 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014498 /* int and int subclasses should print numerically when a numeric */
14499 /* format code is used (see issue18780) */
14500 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014501 break;
14502 case 'o':
14503 numnondigits = 2;
14504 result = PyNumber_ToBase(val, 8);
14505 break;
14506 case 'x':
14507 case 'X':
14508 numnondigits = 2;
14509 result = PyNumber_ToBase(val, 16);
14510 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014511 }
14512 if (!result)
14513 return NULL;
14514
14515 assert(unicode_modifiable(result));
14516 assert(PyUnicode_IS_READY(result));
14517 assert(PyUnicode_IS_ASCII(result));
14518
14519 /* To modify the string in-place, there can only be one reference. */
14520 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014521 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014522 PyErr_BadInternalCall();
14523 return NULL;
14524 }
14525 buf = PyUnicode_DATA(result);
14526 llen = PyUnicode_GET_LENGTH(result);
14527 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014528 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014529 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014530 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014531 return NULL;
14532 }
14533 len = (int)llen;
14534 sign = buf[0] == '-';
14535 numnondigits += sign;
14536 numdigits = len - numnondigits;
14537 assert(numdigits > 0);
14538
14539 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014540 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014541 (type == 'o' || type == 'x' || type == 'X'))) {
14542 assert(buf[sign] == '0');
14543 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14544 buf[sign+1] == 'o');
14545 numnondigits -= 2;
14546 buf += 2;
14547 len -= 2;
14548 if (sign)
14549 buf[0] = '-';
14550 assert(len == numnondigits + numdigits);
14551 assert(numdigits > 0);
14552 }
14553
14554 /* Fill with leading zeroes to meet minimum width. */
14555 if (prec > numdigits) {
14556 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14557 numnondigits + prec);
14558 char *b1;
14559 if (!r1) {
14560 Py_DECREF(result);
14561 return NULL;
14562 }
14563 b1 = PyBytes_AS_STRING(r1);
14564 for (i = 0; i < numnondigits; ++i)
14565 *b1++ = *buf++;
14566 for (i = 0; i < prec - numdigits; i++)
14567 *b1++ = '0';
14568 for (i = 0; i < numdigits; i++)
14569 *b1++ = *buf++;
14570 *b1 = '\0';
14571 Py_DECREF(result);
14572 result = r1;
14573 buf = PyBytes_AS_STRING(result);
14574 len = numnondigits + prec;
14575 }
14576
14577 /* Fix up case for hex conversions. */
14578 if (type == 'X') {
14579 /* Need to convert all lower case letters to upper case.
14580 and need to convert 0x to 0X (and -0x to -0X). */
14581 for (i = 0; i < len; i++)
14582 if (buf[i] >= 'a' && buf[i] <= 'x')
14583 buf[i] -= 'a'-'A';
14584 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014585 if (!PyUnicode_Check(result)
14586 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014587 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014588 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014589 Py_DECREF(result);
14590 result = unicode;
14591 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014592 else if (len != PyUnicode_GET_LENGTH(result)) {
14593 if (PyUnicode_Resize(&result, len) < 0)
14594 Py_CLEAR(result);
14595 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014596 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014597}
14598
Ethan Furmandf3ed242014-01-05 06:50:30 -080014599/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014600 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014601 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014602 * -1 and raise an exception on error */
14603static int
Victor Stinnera47082312012-10-04 02:19:54 +020014604mainformatlong(PyObject *v,
14605 struct unicode_format_arg_t *arg,
14606 PyObject **p_output,
14607 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014608{
14609 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014610 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014611
14612 if (!PyNumber_Check(v))
14613 goto wrongtype;
14614
Ethan Furman9ab74802014-03-21 06:38:46 -070014615 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014616 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014617 if (type == 'o' || type == 'x' || type == 'X') {
14618 iobj = PyNumber_Index(v);
14619 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014620 if (PyErr_ExceptionMatches(PyExc_TypeError))
14621 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014622 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014623 }
14624 }
14625 else {
14626 iobj = PyNumber_Long(v);
14627 if (iobj == NULL ) {
14628 if (PyErr_ExceptionMatches(PyExc_TypeError))
14629 goto wrongtype;
14630 return -1;
14631 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014632 }
14633 assert(PyLong_Check(iobj));
14634 }
14635 else {
14636 iobj = v;
14637 Py_INCREF(iobj);
14638 }
14639
14640 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014641 && arg->width == -1 && arg->prec == -1
14642 && !(arg->flags & (F_SIGN | F_BLANK))
14643 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014644 {
14645 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014646 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014647 int base;
14648
Victor Stinnera47082312012-10-04 02:19:54 +020014649 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014650 {
14651 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014652 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014653 case 'd':
14654 case 'i':
14655 case 'u':
14656 base = 10;
14657 break;
14658 case 'o':
14659 base = 8;
14660 break;
14661 case 'x':
14662 case 'X':
14663 base = 16;
14664 break;
14665 }
14666
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014667 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14668 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014669 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014670 }
14671 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014672 return 1;
14673 }
14674
Ethan Furmanb95b5612015-01-23 20:05:18 -080014675 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014676 Py_DECREF(iobj);
14677 if (res == NULL)
14678 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014679 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014680 return 0;
14681
14682wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014683 switch(type)
14684 {
14685 case 'o':
14686 case 'x':
14687 case 'X':
14688 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014689 "%%%c format: an integer is required, "
14690 "not %.200s",
14691 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014692 break;
14693 default:
14694 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014695 "%%%c format: a number is required, "
14696 "not %.200s",
14697 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014698 break;
14699 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014700 return -1;
14701}
14702
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014703static Py_UCS4
14704formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014705{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014706 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014707 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014708 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014709 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014710 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014711 goto onError;
14712 }
14713 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014714 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014715 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014716 /* make sure number is a type of integer */
14717 if (!PyLong_Check(v)) {
14718 iobj = PyNumber_Index(v);
14719 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014720 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014721 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014722 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014723 Py_DECREF(iobj);
14724 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014725 else {
14726 x = PyLong_AsLong(v);
14727 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014728 if (x == -1 && PyErr_Occurred())
14729 goto onError;
14730
Victor Stinner8faf8212011-12-08 22:14:11 +010014731 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014732 PyErr_SetString(PyExc_OverflowError,
14733 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014734 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014735 }
14736
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014737 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014738 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014739
Benjamin Peterson29060642009-01-31 22:14:21 +000014740 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014741 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014742 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014743 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014744}
14745
Victor Stinnera47082312012-10-04 02:19:54 +020014746/* Parse options of an argument: flags, width, precision.
14747 Handle also "%(name)" syntax.
14748
14749 Return 0 if the argument has been formatted into arg->str.
14750 Return 1 if the argument has been written into ctx->writer,
14751 Raise an exception and return -1 on error. */
14752static int
14753unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14754 struct unicode_format_arg_t *arg)
14755{
14756#define FORMAT_READ(ctx) \
14757 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14758
14759 PyObject *v;
14760
Victor Stinnera47082312012-10-04 02:19:54 +020014761 if (arg->ch == '(') {
14762 /* Get argument value from a dictionary. Example: "%(name)s". */
14763 Py_ssize_t keystart;
14764 Py_ssize_t keylen;
14765 PyObject *key;
14766 int pcount = 1;
14767
14768 if (ctx->dict == NULL) {
14769 PyErr_SetString(PyExc_TypeError,
14770 "format requires a mapping");
14771 return -1;
14772 }
14773 ++ctx->fmtpos;
14774 --ctx->fmtcnt;
14775 keystart = ctx->fmtpos;
14776 /* Skip over balanced parentheses */
14777 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14778 arg->ch = FORMAT_READ(ctx);
14779 if (arg->ch == ')')
14780 --pcount;
14781 else if (arg->ch == '(')
14782 ++pcount;
14783 ctx->fmtpos++;
14784 }
14785 keylen = ctx->fmtpos - keystart - 1;
14786 if (ctx->fmtcnt < 0 || pcount > 0) {
14787 PyErr_SetString(PyExc_ValueError,
14788 "incomplete format key");
14789 return -1;
14790 }
14791 key = PyUnicode_Substring(ctx->fmtstr,
14792 keystart, keystart + keylen);
14793 if (key == NULL)
14794 return -1;
14795 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014796 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014797 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014798 }
14799 ctx->args = PyObject_GetItem(ctx->dict, key);
14800 Py_DECREF(key);
14801 if (ctx->args == NULL)
14802 return -1;
14803 ctx->args_owned = 1;
14804 ctx->arglen = -1;
14805 ctx->argidx = -2;
14806 }
14807
14808 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014809 while (--ctx->fmtcnt >= 0) {
14810 arg->ch = FORMAT_READ(ctx);
14811 ctx->fmtpos++;
14812 switch (arg->ch) {
14813 case '-': arg->flags |= F_LJUST; continue;
14814 case '+': arg->flags |= F_SIGN; continue;
14815 case ' ': arg->flags |= F_BLANK; continue;
14816 case '#': arg->flags |= F_ALT; continue;
14817 case '0': arg->flags |= F_ZERO; continue;
14818 }
14819 break;
14820 }
14821
14822 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014823 if (arg->ch == '*') {
14824 v = unicode_format_getnextarg(ctx);
14825 if (v == NULL)
14826 return -1;
14827 if (!PyLong_Check(v)) {
14828 PyErr_SetString(PyExc_TypeError,
14829 "* wants int");
14830 return -1;
14831 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014832 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014833 if (arg->width == -1 && PyErr_Occurred())
14834 return -1;
14835 if (arg->width < 0) {
14836 arg->flags |= F_LJUST;
14837 arg->width = -arg->width;
14838 }
14839 if (--ctx->fmtcnt >= 0) {
14840 arg->ch = FORMAT_READ(ctx);
14841 ctx->fmtpos++;
14842 }
14843 }
14844 else if (arg->ch >= '0' && arg->ch <= '9') {
14845 arg->width = arg->ch - '0';
14846 while (--ctx->fmtcnt >= 0) {
14847 arg->ch = FORMAT_READ(ctx);
14848 ctx->fmtpos++;
14849 if (arg->ch < '0' || arg->ch > '9')
14850 break;
14851 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14852 mixing signed and unsigned comparison. Since arg->ch is between
14853 '0' and '9', casting to int is safe. */
14854 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14855 PyErr_SetString(PyExc_ValueError,
14856 "width too big");
14857 return -1;
14858 }
14859 arg->width = arg->width*10 + (arg->ch - '0');
14860 }
14861 }
14862
14863 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014864 if (arg->ch == '.') {
14865 arg->prec = 0;
14866 if (--ctx->fmtcnt >= 0) {
14867 arg->ch = FORMAT_READ(ctx);
14868 ctx->fmtpos++;
14869 }
14870 if (arg->ch == '*') {
14871 v = unicode_format_getnextarg(ctx);
14872 if (v == NULL)
14873 return -1;
14874 if (!PyLong_Check(v)) {
14875 PyErr_SetString(PyExc_TypeError,
14876 "* wants int");
14877 return -1;
14878 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014879 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014880 if (arg->prec == -1 && PyErr_Occurred())
14881 return -1;
14882 if (arg->prec < 0)
14883 arg->prec = 0;
14884 if (--ctx->fmtcnt >= 0) {
14885 arg->ch = FORMAT_READ(ctx);
14886 ctx->fmtpos++;
14887 }
14888 }
14889 else if (arg->ch >= '0' && arg->ch <= '9') {
14890 arg->prec = arg->ch - '0';
14891 while (--ctx->fmtcnt >= 0) {
14892 arg->ch = FORMAT_READ(ctx);
14893 ctx->fmtpos++;
14894 if (arg->ch < '0' || arg->ch > '9')
14895 break;
14896 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14897 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014898 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014899 return -1;
14900 }
14901 arg->prec = arg->prec*10 + (arg->ch - '0');
14902 }
14903 }
14904 }
14905
14906 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14907 if (ctx->fmtcnt >= 0) {
14908 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14909 if (--ctx->fmtcnt >= 0) {
14910 arg->ch = FORMAT_READ(ctx);
14911 ctx->fmtpos++;
14912 }
14913 }
14914 }
14915 if (ctx->fmtcnt < 0) {
14916 PyErr_SetString(PyExc_ValueError,
14917 "incomplete format");
14918 return -1;
14919 }
14920 return 0;
14921
14922#undef FORMAT_READ
14923}
14924
14925/* Format one argument. Supported conversion specifiers:
14926
14927 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014928 - "i", "d", "u": int or float
14929 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014930 - "e", "E", "f", "F", "g", "G": float
14931 - "c": int or str (1 character)
14932
Victor Stinner8dbd4212012-12-04 09:30:24 +010014933 When possible, the output is written directly into the Unicode writer
14934 (ctx->writer). A string is created when padding is required.
14935
Victor Stinnera47082312012-10-04 02:19:54 +020014936 Return 0 if the argument has been formatted into *p_str,
14937 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014938 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014939static int
14940unicode_format_arg_format(struct unicode_formatter_t *ctx,
14941 struct unicode_format_arg_t *arg,
14942 PyObject **p_str)
14943{
14944 PyObject *v;
14945 _PyUnicodeWriter *writer = &ctx->writer;
14946
14947 if (ctx->fmtcnt == 0)
14948 ctx->writer.overallocate = 0;
14949
Victor Stinnera47082312012-10-04 02:19:54 +020014950 v = unicode_format_getnextarg(ctx);
14951 if (v == NULL)
14952 return -1;
14953
Victor Stinnera47082312012-10-04 02:19:54 +020014954
14955 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014956 case 's':
14957 case 'r':
14958 case 'a':
14959 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14960 /* Fast path */
14961 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14962 return -1;
14963 return 1;
14964 }
14965
14966 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14967 *p_str = v;
14968 Py_INCREF(*p_str);
14969 }
14970 else {
14971 if (arg->ch == 's')
14972 *p_str = PyObject_Str(v);
14973 else if (arg->ch == 'r')
14974 *p_str = PyObject_Repr(v);
14975 else
14976 *p_str = PyObject_ASCII(v);
14977 }
14978 break;
14979
14980 case 'i':
14981 case 'd':
14982 case 'u':
14983 case 'o':
14984 case 'x':
14985 case 'X':
14986 {
14987 int ret = mainformatlong(v, arg, p_str, writer);
14988 if (ret != 0)
14989 return ret;
14990 arg->sign = 1;
14991 break;
14992 }
14993
14994 case 'e':
14995 case 'E':
14996 case 'f':
14997 case 'F':
14998 case 'g':
14999 case 'G':
15000 if (arg->width == -1 && arg->prec == -1
15001 && !(arg->flags & (F_SIGN | F_BLANK)))
15002 {
15003 /* Fast path */
15004 if (formatfloat(v, arg, NULL, writer) == -1)
15005 return -1;
15006 return 1;
15007 }
15008
15009 arg->sign = 1;
15010 if (formatfloat(v, arg, p_str, NULL) == -1)
15011 return -1;
15012 break;
15013
15014 case 'c':
15015 {
15016 Py_UCS4 ch = formatchar(v);
15017 if (ch == (Py_UCS4) -1)
15018 return -1;
15019 if (arg->width == -1 && arg->prec == -1) {
15020 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015021 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015022 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015023 return 1;
15024 }
15025 *p_str = PyUnicode_FromOrdinal(ch);
15026 break;
15027 }
15028
15029 default:
15030 PyErr_Format(PyExc_ValueError,
15031 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015032 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015033 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15034 (int)arg->ch,
15035 ctx->fmtpos - 1);
15036 return -1;
15037 }
15038 if (*p_str == NULL)
15039 return -1;
15040 assert (PyUnicode_Check(*p_str));
15041 return 0;
15042}
15043
15044static int
15045unicode_format_arg_output(struct unicode_formatter_t *ctx,
15046 struct unicode_format_arg_t *arg,
15047 PyObject *str)
15048{
15049 Py_ssize_t len;
15050 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015051 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015052 Py_ssize_t pindex;
15053 Py_UCS4 signchar;
15054 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015055 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015056 Py_ssize_t sublen;
15057 _PyUnicodeWriter *writer = &ctx->writer;
15058 Py_UCS4 fill;
15059
15060 fill = ' ';
15061 if (arg->sign && arg->flags & F_ZERO)
15062 fill = '0';
15063
15064 if (PyUnicode_READY(str) == -1)
15065 return -1;
15066
15067 len = PyUnicode_GET_LENGTH(str);
15068 if ((arg->width == -1 || arg->width <= len)
15069 && (arg->prec == -1 || arg->prec >= len)
15070 && !(arg->flags & (F_SIGN | F_BLANK)))
15071 {
15072 /* Fast path */
15073 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15074 return -1;
15075 return 0;
15076 }
15077
15078 /* Truncate the string for "s", "r" and "a" formats
15079 if the precision is set */
15080 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15081 if (arg->prec >= 0 && len > arg->prec)
15082 len = arg->prec;
15083 }
15084
15085 /* Adjust sign and width */
15086 kind = PyUnicode_KIND(str);
15087 pbuf = PyUnicode_DATA(str);
15088 pindex = 0;
15089 signchar = '\0';
15090 if (arg->sign) {
15091 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15092 if (ch == '-' || ch == '+') {
15093 signchar = ch;
15094 len--;
15095 pindex++;
15096 }
15097 else if (arg->flags & F_SIGN)
15098 signchar = '+';
15099 else if (arg->flags & F_BLANK)
15100 signchar = ' ';
15101 else
15102 arg->sign = 0;
15103 }
15104 if (arg->width < len)
15105 arg->width = len;
15106
15107 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015108 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015109 if (!(arg->flags & F_LJUST)) {
15110 if (arg->sign) {
15111 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015112 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015113 }
15114 else {
15115 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015116 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015117 }
15118 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015119 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15120 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015121 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015122 }
15123
Victor Stinnera47082312012-10-04 02:19:54 +020015124 buflen = arg->width;
15125 if (arg->sign && len == arg->width)
15126 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015127 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015128 return -1;
15129
15130 /* Write the sign if needed */
15131 if (arg->sign) {
15132 if (fill != ' ') {
15133 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15134 writer->pos += 1;
15135 }
15136 if (arg->width > len)
15137 arg->width--;
15138 }
15139
15140 /* Write the numeric prefix for "x", "X" and "o" formats
15141 if the alternate form is used.
15142 For example, write "0x" for the "%#x" format. */
15143 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15144 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15145 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15146 if (fill != ' ') {
15147 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15148 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15149 writer->pos += 2;
15150 pindex += 2;
15151 }
15152 arg->width -= 2;
15153 if (arg->width < 0)
15154 arg->width = 0;
15155 len -= 2;
15156 }
15157
15158 /* Pad left with the fill character if needed */
15159 if (arg->width > len && !(arg->flags & F_LJUST)) {
15160 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015161 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015162 writer->pos += sublen;
15163 arg->width = len;
15164 }
15165
15166 /* If padding with spaces: write sign if needed and/or numeric prefix if
15167 the alternate form is used */
15168 if (fill == ' ') {
15169 if (arg->sign) {
15170 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15171 writer->pos += 1;
15172 }
15173 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15174 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15175 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15176 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15177 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15178 writer->pos += 2;
15179 pindex += 2;
15180 }
15181 }
15182
15183 /* Write characters */
15184 if (len) {
15185 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15186 str, pindex, len);
15187 writer->pos += len;
15188 }
15189
15190 /* Pad right with the fill character if needed */
15191 if (arg->width > len) {
15192 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015193 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015194 writer->pos += sublen;
15195 }
15196 return 0;
15197}
15198
15199/* Helper of PyUnicode_Format(): format one arg.
15200 Return 0 on success, raise an exception and return -1 on error. */
15201static int
15202unicode_format_arg(struct unicode_formatter_t *ctx)
15203{
15204 struct unicode_format_arg_t arg;
15205 PyObject *str;
15206 int ret;
15207
Victor Stinner8dbd4212012-12-04 09:30:24 +010015208 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015209 if (arg.ch == '%') {
15210 ctx->fmtpos++;
15211 ctx->fmtcnt--;
15212 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15213 return -1;
15214 return 0;
15215 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015216 arg.flags = 0;
15217 arg.width = -1;
15218 arg.prec = -1;
15219 arg.sign = 0;
15220 str = NULL;
15221
Victor Stinnera47082312012-10-04 02:19:54 +020015222 ret = unicode_format_arg_parse(ctx, &arg);
15223 if (ret == -1)
15224 return -1;
15225
15226 ret = unicode_format_arg_format(ctx, &arg, &str);
15227 if (ret == -1)
15228 return -1;
15229
15230 if (ret != 1) {
15231 ret = unicode_format_arg_output(ctx, &arg, str);
15232 Py_DECREF(str);
15233 if (ret == -1)
15234 return -1;
15235 }
15236
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015237 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015238 PyErr_SetString(PyExc_TypeError,
15239 "not all arguments converted during string formatting");
15240 return -1;
15241 }
15242 return 0;
15243}
15244
Alexander Belopolsky40018472011-02-26 01:02:56 +000015245PyObject *
15246PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015247{
Victor Stinnera47082312012-10-04 02:19:54 +020015248 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015249
Guido van Rossumd57fd912000-03-10 22:53:23 +000015250 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015251 PyErr_BadInternalCall();
15252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015253 }
Victor Stinnera47082312012-10-04 02:19:54 +020015254
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015255 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015256 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015257
15258 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015259 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15260 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15261 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15262 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015263
Victor Stinner8f674cc2013-04-17 23:02:17 +020015264 _PyUnicodeWriter_Init(&ctx.writer);
15265 ctx.writer.min_length = ctx.fmtcnt + 100;
15266 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015267
Guido van Rossumd57fd912000-03-10 22:53:23 +000015268 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015269 ctx.arglen = PyTuple_Size(args);
15270 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015271 }
15272 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015273 ctx.arglen = -1;
15274 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015275 }
Victor Stinnera47082312012-10-04 02:19:54 +020015276 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015277 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015278 ctx.dict = args;
15279 else
15280 ctx.dict = NULL;
15281 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015282
Victor Stinnera47082312012-10-04 02:19:54 +020015283 while (--ctx.fmtcnt >= 0) {
15284 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015285 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015286
15287 nonfmtpos = ctx.fmtpos++;
15288 while (ctx.fmtcnt >= 0 &&
15289 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15290 ctx.fmtpos++;
15291 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 }
Victor Stinnera47082312012-10-04 02:19:54 +020015293 if (ctx.fmtcnt < 0) {
15294 ctx.fmtpos--;
15295 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015296 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015297
Victor Stinnercfc4c132013-04-03 01:48:39 +020015298 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15299 nonfmtpos, ctx.fmtpos) < 0)
15300 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015301 }
15302 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015303 ctx.fmtpos++;
15304 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015305 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015306 }
15307 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015308
Victor Stinnera47082312012-10-04 02:19:54 +020015309 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015310 PyErr_SetString(PyExc_TypeError,
15311 "not all arguments converted during string formatting");
15312 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015313 }
15314
Victor Stinnera47082312012-10-04 02:19:54 +020015315 if (ctx.args_owned) {
15316 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015317 }
Victor Stinnera47082312012-10-04 02:19:54 +020015318 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015319
Benjamin Peterson29060642009-01-31 22:14:21 +000015320 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015321 _PyUnicodeWriter_Dealloc(&ctx.writer);
15322 if (ctx.args_owned) {
15323 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015324 }
15325 return NULL;
15326}
15327
Jeremy Hylton938ace62002-07-17 16:30:39 +000015328static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015329unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15330
Tim Peters6d6c1a32001-08-02 04:15:00 +000015331static PyObject *
15332unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15333{
Benjamin Peterson29060642009-01-31 22:14:21 +000015334 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 static char *kwlist[] = {"object", "encoding", "errors", 0};
15336 char *encoding = NULL;
15337 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015338
Benjamin Peterson14339b62009-01-31 16:36:08 +000015339 if (type != &PyUnicode_Type)
15340 return unicode_subtype_new(type, args, kwds);
15341 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015342 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 return NULL;
15344 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015345 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 if (encoding == NULL && errors == NULL)
15347 return PyObject_Str(x);
15348 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015349 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015350}
15351
Guido van Rossume023fe02001-08-30 03:12:59 +000015352static PyObject *
15353unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15354{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015355 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015356 Py_ssize_t length, char_size;
15357 int share_wstr, share_utf8;
15358 unsigned int kind;
15359 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015360
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015362
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015363 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015364 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015366 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015367 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015368 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015369 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015370 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015371
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015372 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015373 if (self == NULL) {
15374 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015375 return NULL;
15376 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015377 kind = PyUnicode_KIND(unicode);
15378 length = PyUnicode_GET_LENGTH(unicode);
15379
15380 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015381#ifdef Py_DEBUG
15382 _PyUnicode_HASH(self) = -1;
15383#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015384 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015385#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015386 _PyUnicode_STATE(self).interned = 0;
15387 _PyUnicode_STATE(self).kind = kind;
15388 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015389 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015390 _PyUnicode_STATE(self).ready = 1;
15391 _PyUnicode_WSTR(self) = NULL;
15392 _PyUnicode_UTF8_LENGTH(self) = 0;
15393 _PyUnicode_UTF8(self) = NULL;
15394 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015395 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015396
15397 share_utf8 = 0;
15398 share_wstr = 0;
15399 if (kind == PyUnicode_1BYTE_KIND) {
15400 char_size = 1;
15401 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15402 share_utf8 = 1;
15403 }
15404 else if (kind == PyUnicode_2BYTE_KIND) {
15405 char_size = 2;
15406 if (sizeof(wchar_t) == 2)
15407 share_wstr = 1;
15408 }
15409 else {
15410 assert(kind == PyUnicode_4BYTE_KIND);
15411 char_size = 4;
15412 if (sizeof(wchar_t) == 4)
15413 share_wstr = 1;
15414 }
15415
15416 /* Ensure we won't overflow the length. */
15417 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15418 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015419 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015420 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015421 data = PyObject_MALLOC((length + 1) * char_size);
15422 if (data == NULL) {
15423 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015424 goto onError;
15425 }
15426
Victor Stinnerc3c74152011-10-02 20:39:55 +020015427 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015428 if (share_utf8) {
15429 _PyUnicode_UTF8_LENGTH(self) = length;
15430 _PyUnicode_UTF8(self) = data;
15431 }
15432 if (share_wstr) {
15433 _PyUnicode_WSTR_LENGTH(self) = length;
15434 _PyUnicode_WSTR(self) = (wchar_t *)data;
15435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015436
Christian Heimesf051e432016-09-13 20:22:02 +020015437 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015438 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015439 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015440#ifdef Py_DEBUG
15441 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15442#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015443 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015444 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015445
15446onError:
15447 Py_DECREF(unicode);
15448 Py_DECREF(self);
15449 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015450}
15451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015452PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015453"str(object='') -> str\n\
15454str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015455\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015456Create a new string object from the given object. If encoding or\n\
15457errors is specified, then the object must expose a data buffer\n\
15458that will be decoded using the given encoding and error handler.\n\
15459Otherwise, returns the result of object.__str__() (if defined)\n\
15460or repr(object).\n\
15461encoding defaults to sys.getdefaultencoding().\n\
15462errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015463
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015464static PyObject *unicode_iter(PyObject *seq);
15465
Guido van Rossumd57fd912000-03-10 22:53:23 +000015466PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015467 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015468 "str", /* tp_name */
15469 sizeof(PyUnicodeObject), /* tp_basicsize */
15470 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015471 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015472 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015473 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015474 0, /* tp_getattr */
15475 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015476 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015477 unicode_repr, /* tp_repr */
15478 &unicode_as_number, /* tp_as_number */
15479 &unicode_as_sequence, /* tp_as_sequence */
15480 &unicode_as_mapping, /* tp_as_mapping */
15481 (hashfunc) unicode_hash, /* tp_hash*/
15482 0, /* tp_call*/
15483 (reprfunc) unicode_str, /* tp_str */
15484 PyObject_GenericGetAttr, /* tp_getattro */
15485 0, /* tp_setattro */
15486 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015487 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015488 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15489 unicode_doc, /* tp_doc */
15490 0, /* tp_traverse */
15491 0, /* tp_clear */
15492 PyUnicode_RichCompare, /* tp_richcompare */
15493 0, /* tp_weaklistoffset */
15494 unicode_iter, /* tp_iter */
15495 0, /* tp_iternext */
15496 unicode_methods, /* tp_methods */
15497 0, /* tp_members */
15498 0, /* tp_getset */
15499 &PyBaseObject_Type, /* tp_base */
15500 0, /* tp_dict */
15501 0, /* tp_descr_get */
15502 0, /* tp_descr_set */
15503 0, /* tp_dictoffset */
15504 0, /* tp_init */
15505 0, /* tp_alloc */
15506 unicode_new, /* tp_new */
15507 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015508};
15509
15510/* Initialize the Unicode implementation */
15511
Victor Stinner331a6a52019-05-27 16:39:22 +020015512PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015513_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015514{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015515 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015516 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015517 0x000A, /* LINE FEED */
15518 0x000D, /* CARRIAGE RETURN */
15519 0x001C, /* FILE SEPARATOR */
15520 0x001D, /* GROUP SEPARATOR */
15521 0x001E, /* RECORD SEPARATOR */
15522 0x0085, /* NEXT LINE */
15523 0x2028, /* LINE SEPARATOR */
15524 0x2029, /* PARAGRAPH SEPARATOR */
15525 };
15526
Fred Drakee4315f52000-05-09 19:53:39 +000015527 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015528 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015529 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015530 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015531 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015532 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015533
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015534 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015535 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015536 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015537
15538 /* initialize the linebreak bloom filter */
15539 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015540 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015541 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015542
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015543 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015544 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015545 }
15546 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015547 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015548 }
15549 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015550 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015551 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015552 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015553}
15554
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015555
Walter Dörwald16807132007-05-25 13:52:07 +000015556void
15557PyUnicode_InternInPlace(PyObject **p)
15558{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015559 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015560#ifdef Py_DEBUG
15561 assert(s != NULL);
15562 assert(_PyUnicode_CHECK(s));
15563#else
Victor Stinner607b1022020-05-05 18:50:30 +020015564 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015565 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015566 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015567#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015568
Benjamin Peterson14339b62009-01-31 16:36:08 +000015569 /* If it's a subclass, we don't really know what putting
15570 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015571 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015572 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015573 }
15574
15575 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015576 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015577 }
15578
15579#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015580 if (interned == NULL) {
15581 interned = PyDict_New();
15582 if (interned == NULL) {
15583 PyErr_Clear(); /* Don't leave an exception */
15584 return;
15585 }
15586 }
Victor Stinner607b1022020-05-05 18:50:30 +020015587
15588 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015589 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015590 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015591 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015592
Berker Peksagced8d4c2016-07-25 04:40:39 +030015593 if (t == NULL) {
15594 PyErr_Clear();
15595 return;
15596 }
Victor Stinner607b1022020-05-05 18:50:30 +020015597
Berker Peksagced8d4c2016-07-25 04:40:39 +030015598 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015599 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015600 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015601 return;
15602 }
Victor Stinner607b1022020-05-05 18:50:30 +020015603
Benjamin Peterson14339b62009-01-31 16:36:08 +000015604 /* The two references in interned are not counted by refcnt.
15605 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015606 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015607 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015608#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015609}
15610
15611void
15612PyUnicode_InternImmortal(PyObject **p)
15613{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015614 PyUnicode_InternInPlace(p);
15615 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015616 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015617 Py_INCREF(*p);
15618 }
Walter Dörwald16807132007-05-25 13:52:07 +000015619}
15620
15621PyObject *
15622PyUnicode_InternFromString(const char *cp)
15623{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015624 PyObject *s = PyUnicode_FromString(cp);
15625 if (s == NULL)
15626 return NULL;
15627 PyUnicode_InternInPlace(&s);
15628 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015629}
15630
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015631
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo the reference-count adjustments made for interned strings and
   release the interned dict.

   Since this function is intended to help a leak detector, interned
   strings are not forcibly deallocated; rather, each string gets back
   the references that are not counted while it is interned (one for an
   immortal string, two for a mortal one), is flagged as not interned,
   and then the interned dict is cleared and DECREF'ed.

   Does nothing if the interned dict does not exist; any error raised
   while listing its keys is cleared. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        /* Don't leak the object if it is unexpectedly not a list. */
        Py_XDECREF(keys);
        PyErr_Clear();
        return;
    }

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* Use Py_SET_REFCNT() rather than assigning through
               Py_REFCNT(), like PyUnicode_InternInPlace() does. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015692
15693
15694/********************* Unicode Iterator **************************/
15695
15696typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015697 PyObject_HEAD
15698 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015699 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015700} unicodeiterobject;
15701
15702static void
15703unicodeiter_dealloc(unicodeiterobject *it)
15704{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015705 _PyObject_GC_UNTRACK(it);
15706 Py_XDECREF(it->it_seq);
15707 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015708}
15709
15710static int
15711unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15712{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015713 Py_VISIT(it->it_seq);
15714 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015715}
15716
15717static PyObject *
15718unicodeiter_next(unicodeiterobject *it)
15719{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015720 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015721
Benjamin Peterson14339b62009-01-31 16:36:08 +000015722 assert(it != NULL);
15723 seq = it->it_seq;
15724 if (seq == NULL)
15725 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015726 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015728 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15729 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015730 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015731 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15732 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015733 if (item != NULL)
15734 ++it->it_index;
15735 return item;
15736 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015737
Benjamin Peterson14339b62009-01-31 16:36:08 +000015738 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015739 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015740 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015741}
15742
15743static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015744unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015745{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015746 Py_ssize_t len = 0;
15747 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015748 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015749 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015750}
15751
15752PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15753
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015754static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015755unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015756{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015757 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015758 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015759 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015760 it->it_seq, it->it_index);
15761 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015762 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015763 if (u == NULL)
15764 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015765 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015766 }
15767}
15768
15769PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15770
15771static PyObject *
15772unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15773{
15774 Py_ssize_t index = PyLong_AsSsize_t(state);
15775 if (index == -1 && PyErr_Occurred())
15776 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015777 if (it->it_seq != NULL) {
15778 if (index < 0)
15779 index = 0;
15780 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15781 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15782 it->it_index = index;
15783 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015784 Py_RETURN_NONE;
15785}
15786
15787PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15788
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015789static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015790 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015791 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015792 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15793 reduce_doc},
15794 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15795 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015796 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015797};
15798
15799PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015800 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15801 "str_iterator", /* tp_name */
15802 sizeof(unicodeiterobject), /* tp_basicsize */
15803 0, /* tp_itemsize */
15804 /* methods */
15805 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015806 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015807 0, /* tp_getattr */
15808 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015809 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015810 0, /* tp_repr */
15811 0, /* tp_as_number */
15812 0, /* tp_as_sequence */
15813 0, /* tp_as_mapping */
15814 0, /* tp_hash */
15815 0, /* tp_call */
15816 0, /* tp_str */
15817 PyObject_GenericGetAttr, /* tp_getattro */
15818 0, /* tp_setattro */
15819 0, /* tp_as_buffer */
15820 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15821 0, /* tp_doc */
15822 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15823 0, /* tp_clear */
15824 0, /* tp_richcompare */
15825 0, /* tp_weaklistoffset */
15826 PyObject_SelfIter, /* tp_iter */
15827 (iternextfunc)unicodeiter_next, /* tp_iternext */
15828 unicodeiter_methods, /* tp_methods */
15829 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015830};
15831
15832static PyObject *
15833unicode_iter(PyObject *seq)
15834{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015835 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015836
Benjamin Peterson14339b62009-01-31 16:36:08 +000015837 if (!PyUnicode_Check(seq)) {
15838 PyErr_BadInternalCall();
15839 return NULL;
15840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015841 if (PyUnicode_READY(seq) == -1)
15842 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015843 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15844 if (it == NULL)
15845 return NULL;
15846 it->it_index = 0;
15847 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015848 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015849 _PyObject_GC_TRACK(it);
15850 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015851}
15852
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015853
15854size_t
15855Py_UNICODE_strlen(const Py_UNICODE *u)
15856{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015857 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015858}
15859
15860Py_UNICODE*
15861Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15862{
15863 Py_UNICODE *u = s1;
15864 while ((*u++ = *s2++));
15865 return s1;
15866}
15867
15868Py_UNICODE*
15869Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15870{
15871 Py_UNICODE *u = s1;
15872 while ((*u++ = *s2++))
15873 if (n-- == 0)
15874 break;
15875 return s1;
15876}
15877
15878Py_UNICODE*
15879Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15880{
15881 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015882 u1 += wcslen(u1);
15883 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015884 return s1;
15885}
15886
15887int
15888Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15889{
15890 while (*s1 && *s2 && *s1 == *s2)
15891 s1++, s2++;
15892 if (*s1 && *s2)
15893 return (*s1 < *s2) ? -1 : +1;
15894 if (*s1)
15895 return 1;
15896 if (*s2)
15897 return -1;
15898 return 0;
15899}
15900
15901int
15902Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15903{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015904 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015905 for (; n != 0; n--) {
15906 u1 = *s1;
15907 u2 = *s2;
15908 if (u1 != u2)
15909 return (u1 < u2) ? -1 : +1;
15910 if (u1 == '\0')
15911 return 0;
15912 s1++;
15913 s2++;
15914 }
15915 return 0;
15916}
15917
15918Py_UNICODE*
15919Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15920{
15921 const Py_UNICODE *p;
15922 for (p = s; *p; p++)
15923 if (*p == c)
15924 return (Py_UNICODE*)p;
15925 return NULL;
15926}
15927
15928Py_UNICODE*
15929Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15930{
15931 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015932 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015933 while (p != s) {
15934 p--;
15935 if (*p == c)
15936 return (Py_UNICODE*)p;
15937 }
15938 return NULL;
15939}
Victor Stinner331ea922010-08-10 16:37:20 +000015940
Victor Stinner71133ff2010-09-01 23:43:53 +000015941Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015942PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015943{
Victor Stinner577db2c2011-10-11 22:12:48 +020015944 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015945 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015947 if (!PyUnicode_Check(unicode)) {
15948 PyErr_BadArgument();
15949 return NULL;
15950 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015951 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015952 if (u == NULL)
15953 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015954 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015955 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015956 PyErr_NoMemory();
15957 return NULL;
15958 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015959 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015960 size *= sizeof(Py_UNICODE);
15961 copy = PyMem_Malloc(size);
15962 if (copy == NULL) {
15963 PyErr_NoMemory();
15964 return NULL;
15965 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015966 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015967 return copy;
15968}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015969
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015970
Victor Stinner709d23d2019-05-02 14:56:30 -040015971static int
15972encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015973{
Victor Stinner709d23d2019-05-02 14:56:30 -040015974 int res;
15975 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15976 if (res == -2) {
15977 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15978 return -1;
15979 }
15980 if (res < 0) {
15981 PyErr_NoMemory();
15982 return -1;
15983 }
15984 return 0;
15985}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015986
Victor Stinner709d23d2019-05-02 14:56:30 -040015987
15988static int
15989config_get_codec_name(wchar_t **config_encoding)
15990{
15991 char *encoding;
15992 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15993 return -1;
15994 }
15995
15996 PyObject *name_obj = NULL;
15997 PyObject *codec = _PyCodec_Lookup(encoding);
15998 PyMem_RawFree(encoding);
15999
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016000 if (!codec)
16001 goto error;
16002
16003 name_obj = PyObject_GetAttrString(codec, "name");
16004 Py_CLEAR(codec);
16005 if (!name_obj) {
16006 goto error;
16007 }
16008
Victor Stinner709d23d2019-05-02 14:56:30 -040016009 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16010 Py_DECREF(name_obj);
16011 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016012 goto error;
16013 }
16014
Victor Stinner709d23d2019-05-02 14:56:30 -040016015 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16016 if (raw_wname == NULL) {
16017 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016018 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016019 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016020 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016021
16022 PyMem_RawFree(*config_encoding);
16023 *config_encoding = raw_wname;
16024
16025 PyMem_Free(wname);
16026 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016027
16028error:
16029 Py_XDECREF(codec);
16030 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016031 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016032}
16033
16034
Victor Stinner331a6a52019-05-27 16:39:22 +020016035static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016036init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016037{
Victor Stinner709d23d2019-05-02 14:56:30 -040016038 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016039 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016040 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016041 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016042 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016043 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016044 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016045}
16046
16047
Victor Stinner709d23d2019-05-02 14:56:30 -040016048static int
16049init_fs_codec(PyInterpreterState *interp)
16050{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016051 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016052
16053 _Py_error_handler error_handler;
16054 error_handler = get_error_handler_wide(config->filesystem_errors);
16055 if (error_handler == _Py_ERROR_UNKNOWN) {
16056 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16057 return -1;
16058 }
16059
16060 char *encoding, *errors;
16061 if (encode_wstr_utf8(config->filesystem_encoding,
16062 &encoding,
16063 "filesystem_encoding") < 0) {
16064 return -1;
16065 }
16066
16067 if (encode_wstr_utf8(config->filesystem_errors,
16068 &errors,
16069 "filesystem_errors") < 0) {
16070 PyMem_RawFree(encoding);
16071 return -1;
16072 }
16073
16074 PyMem_RawFree(interp->fs_codec.encoding);
16075 interp->fs_codec.encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016076 /* encoding has been normalized by init_fs_encoding() */
16077 interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
Victor Stinner709d23d2019-05-02 14:56:30 -040016078 PyMem_RawFree(interp->fs_codec.errors);
16079 interp->fs_codec.errors = errors;
16080 interp->fs_codec.error_handler = error_handler;
16081
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016082#ifdef _Py_FORCE_UTF8_FS_ENCODING
16083 assert(interp->fs_codec.utf8 == 1);
16084#endif
16085
Victor Stinner709d23d2019-05-02 14:56:30 -040016086 /* At this point, PyUnicode_EncodeFSDefault() and
16087 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16088 the C implementation of the filesystem encoding. */
16089
16090 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16091 global configuration variables. */
16092 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
16093 interp->fs_codec.errors) < 0) {
16094 PyErr_NoMemory();
16095 return -1;
16096 }
16097 return 0;
16098}
16099
16100
Victor Stinner331a6a52019-05-27 16:39:22 +020016101static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016102init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016103{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016104 PyInterpreterState *interp = tstate->interp;
16105
Victor Stinner709d23d2019-05-02 14:56:30 -040016106 /* Update the filesystem encoding to the normalized Python codec name.
16107 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16108 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016109 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016110 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016111 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016112 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016113 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016114 }
16115
Victor Stinner709d23d2019-05-02 14:56:30 -040016116 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016117 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016118 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016119 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016120}
16121
16122
Victor Stinner331a6a52019-05-27 16:39:22 +020016123PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016124_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016125{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016126 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016127 if (_PyStatus_EXCEPTION(status)) {
16128 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016129 }
16130
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016131 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016132}
16133
16134
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016135static void
16136_PyUnicode_FiniEncodings(PyThreadState *tstate)
16137{
16138 PyInterpreterState *interp = tstate->interp;
16139 PyMem_RawFree(interp->fs_codec.encoding);
16140 interp->fs_codec.encoding = NULL;
16141 interp->fs_codec.utf8 = 0;
16142 PyMem_RawFree(interp->fs_codec.errors);
16143 interp->fs_codec.errors = NULL;
16144 interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
16145}
16146
16147
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Return 0 on success, -1 with an exception set on failure.  The
   configuration is left unchanged if the new strings cannot be
   allocated. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    /* The configuration is modified in place, hence the cast. */
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both strings before touching the configuration. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16173
16174
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016175void
Victor Stinner3d483342019-11-22 12:27:50 +010016176_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016177{
Victor Stinner3d483342019-11-22 12:27:50 +010016178 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016179#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016180 /* Insure++ is a memory analysis tool that aids in discovering
16181 * memory leaks and other memory problems. On Python exit, the
16182 * interned string dictionaries are flagged as being in use at exit
16183 * (which it is). Under normal circumstances, this is fine because
16184 * the memory will be automatically reclaimed by the system. Under
16185 * memory debugging, it's a huge source of useless noise, so we
16186 * trade off slower shutdown for less distraction in the memory
16187 * reports. -baw
16188 */
16189 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016190#endif /* __INSURE__ */
16191
Victor Stinner3d483342019-11-22 12:27:50 +010016192 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016193
Victor Stinner607b1022020-05-05 18:50:30 +020016194#ifdef LATIN1_SINGLETONS
Victor Stinner3d483342019-11-22 12:27:50 +010016195 for (Py_ssize_t i = 0; i < 256; i++) {
16196 Py_CLEAR(unicode_latin1[i]);
16197 }
Victor Stinner607b1022020-05-05 18:50:30 +020016198#endif
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016199 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016200 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016201
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016202 _PyUnicode_FiniEncodings(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016203}
16204
16205
Georg Brandl66c221e2010-10-14 07:04:07 +000016206/* A _string module, to export formatter_parser and formatter_field_name_split
16207 to the string.Formatter class implemented in Python. */
16208
16209static PyMethodDef _string_methods[] = {
16210 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16211 METH_O, PyDoc_STR("split the argument as a field name")},
16212 {"formatter_parser", (PyCFunction) formatter_parser,
16213 METH_O, PyDoc_STR("parse the argument as a format string")},
16214 {NULL, NULL}
16215};
16216
16217static struct PyModuleDef _string_module = {
16218 PyModuleDef_HEAD_INIT,
16219 "_string",
16220 PyDoc_STR("string helper module"),
16221 0,
16222 _string_methods,
16223 NULL,
16224 NULL,
16225 NULL,
16226 NULL
16227};
16228
16229PyMODINIT_FUNC
16230PyInit__string(void)
16231{
16232 return PyModule_Create(&_string_module);
16233}
16234
16235
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016236#ifdef __cplusplus
16237}
16238#endif