blob: 7ab0c882db0490d1bbfff53f7975701005606e23 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020047#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinnerbcda8f12018-11-21 22:27:47 +010048#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020049#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040050#include "pycore_pylifecycle.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020051#include "pycore_pystate.h" // _PyInterpreterState_GET()
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000052#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070053#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
Victor Stinnerfecc4f22019-03-19 14:20:29 +010059/* Uncomment to display statistics on interned strings at exit when
60 using Valgrind or Insecure++. */
61/* #define INTERNED_STATS 1 */
62
63
Larry Hastings61272b72014-01-07 12:41:53 -080064/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090065class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080066[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090067/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
69/*[python input]
70class Py_UCS4_converter(CConverter):
71 type = 'Py_UCS4'
72 converter = 'convert_uc'
73
74 def converter_init(self):
75 if self.default is not unspecified:
76 self.c_default = ascii(self.default)
77 if len(self.c_default) > 4 or self.c_default[0] != "'":
78 self.c_default = hex(ord(self.default))
79
80[python start generated code]*/
81/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchaka05997252013-01-26 12:14:02 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner8faf8212011-12-08 22:14:11 +010096/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
97#define MAX_UNICODE 0x10ffff
98
/* Validation used by the accessor macros in this file: release builds do a
   plain type check, debug builds run the structural consistency check
   (with check_content == 0). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200104
/* --- Field accessors ----------------------------------------------------
   The macros without an assert() are bare casts plus a member access; the
   others validate the object with _PyUnicode_CHECK() first. */

/* `utf8` member of the PyCompactUnicodeObject header (no checks). */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII objects the bytes that
   start right after the PyASCIIObject header, otherwise the `utf8` member. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((char *)((PyASCIIObject *)(op) + 1))            \
         : _PyUnicode_UTF8(op))

/* `utf8_length` member of the PyCompactUnicodeObject header (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* UTF-8 length of a ready string: the `length` member for compact ASCII
   objects, otherwise the `utf8_length` member. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((PyASCIIObject *)(op))->length                  \
         : _PyUnicode_UTF8_LENGTH(op))

/* `wstr` member of the PyASCIIObject header (no checks). */
#define _PyUnicode_WSTR(op) (((PyASCIIObject *)(op))->wstr)

/* `wstr_length` member of the PyCompactUnicodeObject header (no checks). */
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* `length`, `state` and `hash` members of the PyASCIIObject header
   (no checks). */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)  (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)   (((PyASCIIObject *)(op))->hash)

/* Checked read of `state.kind`. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)

/* Checked read of `length`. */
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* `data.any` member of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200139
/* File-local replacement for PyUnicode_READY(): after validating `op`,
   yields 0 when the string is already ready and otherwise the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200146
/* True when the `utf8` pointer of `op` is the very same address as its
   character data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),                   \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True when the `wstr` pointer of `op` is the very same address as its
   character data.

   The comparison uses the macro argument on both sides; referring to a
   caller-side variable by name would make the macro compile only where a
   variable called `unicode` happens to be in scope, and would test that
   variable instead of the argument. */
#define _PyUnicode_SHARE_WSTR(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
154
Victor Stinner829c0ad2011-10-03 01:08:02 +0200155/* true if the Unicode object has an allocated UTF-8 memory block
156 (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (!PyUnicode_IS_COMPACT_ASCII(op) &&                         \
     _PyUnicode_UTF8(op) &&                                     \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
161
Victor Stinner03490912011-10-03 23:45:12 +0200162/* true if the Unicode object has an allocated wstr memory block
163 (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (_PyUnicode_WSTR(op) &&                                     \
     (!PyUnicode_IS_READY(op) ||                                \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
168
Victor Stinner910337b2011-10-03 03:20:16 +0200169/* Generic helper macro to convert characters of different types.
170 from_type and to_type have to be valid type names, begin and end
171 are pointers to the source characters which should be of type
172 "from_type *". to is a pointer of type "to_type *" and points to the
173 buffer where the result characters are written to. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t n = _end - _iter;                                    \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        /* bulk phase: four elements per iteration */                   \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        /* tail phase: the remaining 0..3 elements */                   \
        for (; _iter < _end; ++_iter, ++_to) {                          \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200192
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
200
Victor Stinner607b1022020-05-05 18:50:30 +0200201/* bpo-40521: Interned strings are shared by all interpreters. */
202#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
203# define INTERNED_STRINGS
204#endif
205
Walter Dörwald16807132007-05-25 13:52:07 +0000206/* This dictionary holds all interned unicode strings. Note that references
207 to strings in this dictionary are *not* counted in the string's ob_refcnt.
208 When the interned string reaches a refcnt of 0 the string deallocation
209 function will delete the reference from this dictionary.
210
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000213*/
Victor Stinner607b1022020-05-05 18:50:30 +0200214#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200215static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200216#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000217
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000218/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200219static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200220
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return the shared empty string from the enclosing function after taking
   a reference to it (the returned value is NULL if it could not be
   created). */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000239
Victor Stinner59423e32018-11-26 13:40:01 +0100240static inline void
241unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
242 Py_ssize_t start, Py_ssize_t length)
243{
244 assert(0 <= start);
245 assert(kind != PyUnicode_WCHAR_KIND);
246 switch (kind) {
247 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100248 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100249 Py_UCS1 ch = (unsigned char)value;
250 Py_UCS1 *to = (Py_UCS1 *)data + start;
251 memset(to, ch, length);
252 break;
253 }
254 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS2 ch = (Py_UCS2)value;
257 Py_UCS2 *to = (Py_UCS2 *)data + start;
258 const Py_UCS2 *end = to + length;
259 for (; to < end; ++to) *to = ch;
260 break;
261 }
262 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100263 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100264 Py_UCS4 ch = value;
265 Py_UCS4 * to = (Py_UCS4 *)data + start;
266 const Py_UCS4 *end = to + length;
267 for (; to < end; ++to) *to = ch;
268 break;
269 }
270 default: Py_UNREACHABLE();
271 }
272}
273
274
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200275/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700276static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900278static inline void
279_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400280static PyObject *
281unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
282 const char *errors);
283static PyObject *
284unicode_decode_utf8(const char *s, Py_ssize_t size,
285 _Py_error_handler error_handler, const char *errors,
286 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200287
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200288/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200289static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200290
Victor Stinner607b1022020-05-05 18:50:30 +0200291/* bpo-40521: Latin1 singletons are shared by all interpreters. */
292#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
293# define LATIN1_SINGLETONS
294#endif
295
296#ifdef LATIN1_SINGLETONS
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297/* Single character Unicode strings in the Latin-1 range are being
298 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200299static PyObject *unicode_latin1[256] = {NULL};
Victor Stinner607b1022020-05-05 18:50:30 +0200300#endif
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000301
Christian Heimes190d79e2008-01-30 11:58:22 +0000302/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
332
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200333/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200334static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200335static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100336static int unicode_modifiable(PyObject *unicode);
337
Victor Stinnerfe226c02011-10-03 03:52:20 +0200338
Alexander Belopolsky40018472011-02-26 01:02:56 +0000339static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100340_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200341static PyObject *
342_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
343static PyObject *
344_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
345
346static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000347unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000348 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100349 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000350 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
351
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352static void
353raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300354 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100355 PyObject *unicode,
356 Py_ssize_t startpos, Py_ssize_t endpos,
357 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000358
Christian Heimes190d79e2008-01-30 11:58:22 +0000359/* Same for linebreaks */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
386
INADA Naoki3ae20562017-01-16 20:41:20 +0900387static int convert_uc(PyObject *obj, void *addr);
388
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300389#include "clinic/unicodeobject.c.h"
390
Victor Stinner3d4226a2018-08-29 22:21:32 +0200391_Py_error_handler
392_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200393{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200395 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200396 }
397 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 }
400 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200401 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200402 }
403 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200404 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200405 }
406 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200407 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200408 }
409 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200410 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200411 }
412 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200413 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200414 }
Victor Stinner50149202015-09-22 00:26:54 +0200415 return _Py_ERROR_OTHER;
416}
417
Victor Stinner709d23d2019-05-02 14:56:30 -0400418
419static _Py_error_handler
420get_error_handler_wide(const wchar_t *errors)
421{
422 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
423 return _Py_ERROR_STRICT;
424 }
425 if (wcscmp(errors, L"surrogateescape") == 0) {
426 return _Py_ERROR_SURROGATEESCAPE;
427 }
428 if (wcscmp(errors, L"replace") == 0) {
429 return _Py_ERROR_REPLACE;
430 }
431 if (wcscmp(errors, L"ignore") == 0) {
432 return _Py_ERROR_IGNORE;
433 }
434 if (wcscmp(errors, L"backslashreplace") == 0) {
435 return _Py_ERROR_BACKSLASHREPLACE;
436 }
437 if (wcscmp(errors, L"surrogatepass") == 0) {
438 return _Py_ERROR_SURROGATEPASS;
439 }
440 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
441 return _Py_ERROR_XMLCHARREFREPLACE;
442 }
443 return _Py_ERROR_OTHER;
444}
445
446
Victor Stinner22eb6892019-06-26 00:51:05 +0200447static inline int
448unicode_check_encoding_errors(const char *encoding, const char *errors)
449{
450 if (encoding == NULL && errors == NULL) {
451 return 0;
452 }
453
Victor Stinner81a7be32020-04-14 15:14:01 +0200454 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200455#ifndef Py_DEBUG
456 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200457 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200458 return 0;
459 }
460#else
461 /* Always check in debug mode */
462#endif
463
464 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
465 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200466 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200467 return 0;
468 }
469
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200470 /* Disable checks during Python finalization. For example, it allows to
471 call _PyObject_Dump() during finalization for debugging purpose. */
472 if (interp->finalizing) {
473 return 0;
474 }
475
Victor Stinner22eb6892019-06-26 00:51:05 +0200476 if (encoding != NULL) {
477 PyObject *handler = _PyCodec_Lookup(encoding);
478 if (handler == NULL) {
479 return -1;
480 }
481 Py_DECREF(handler);
482 }
483
484 if (errors != NULL) {
485 PyObject *handler = PyCodec_LookupError(errors);
486 if (handler == NULL) {
487 return -1;
488 }
489 Py_DECREF(handler);
490 }
491 return 0;
492}
493
494
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300495/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
496 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000497Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000498PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000499{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000500#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000501 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000502#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000503 /* This is actually an illegal character, so it should
504 not be passed to unichr. */
505 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000506#endif
507}
508
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200509int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100510_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200511{
Victor Stinner68762572019-10-07 18:42:01 +0200512#define CHECK(expr) \
513 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
514
Victor Stinner910337b2011-10-03 03:20:16 +0200515 PyASCIIObject *ascii;
516 unsigned int kind;
517
Victor Stinner68762572019-10-07 18:42:01 +0200518 assert(op != NULL);
519 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200520
521 ascii = (PyASCIIObject *)op;
522 kind = ascii->state.kind;
523
Victor Stinnera3b334d2011-10-03 13:53:37 +0200524 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND);
526 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200527 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200528 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200529 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200530 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200531
Victor Stinnera41463c2011-10-04 01:05:08 +0200532 if (ascii->state.compact == 1) {
533 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200534 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200535 || kind == PyUnicode_2BYTE_KIND
536 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->state.ascii == 0);
538 CHECK(ascii->state.ready == 1);
539 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100540 }
541 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
543
544 data = unicode->data.any;
545 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(ascii->length == 0);
547 CHECK(ascii->hash == -1);
548 CHECK(ascii->state.compact == 0);
549 CHECK(ascii->state.ascii == 0);
550 CHECK(ascii->state.ready == 0);
551 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
552 CHECK(ascii->wstr != NULL);
553 CHECK(data == NULL);
554 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200555 }
556 else {
Victor Stinner68762572019-10-07 18:42:01 +0200557 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200558 || kind == PyUnicode_2BYTE_KIND
559 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200560 CHECK(ascii->state.compact == 0);
561 CHECK(ascii->state.ready == 1);
562 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200563 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200564 CHECK(compact->utf8 == data);
565 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 }
567 else
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200569 }
570 }
571 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200572 if (
573#if SIZEOF_WCHAR_T == 2
574 kind == PyUnicode_2BYTE_KIND
575#else
576 kind == PyUnicode_4BYTE_KIND
577#endif
578 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 {
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(ascii->wstr == data);
581 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200582 } else
Victor Stinner68762572019-10-07 18:42:01 +0200583 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200584 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200585
586 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200587 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200588 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200589 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200590 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200591
592 /* check that the best kind is used: O(n) operation */
593 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 Py_ssize_t i;
595 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300596 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200597 Py_UCS4 ch;
598
599 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200600 for (i=0; i < ascii->length; i++)
601 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200602 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200603 if (ch > maxchar)
604 maxchar = ch;
605 }
606 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100607 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200608 CHECK(maxchar >= 128);
609 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100610 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200611 else
Victor Stinner68762572019-10-07 18:42:01 +0200612 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200613 }
Victor Stinner77faf692011-11-20 18:56:05 +0100614 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200615 CHECK(maxchar >= 0x100);
616 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100617 }
618 else {
Victor Stinner68762572019-10-07 18:42:01 +0200619 CHECK(maxchar >= 0x10000);
620 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100621 }
Victor Stinner68762572019-10-07 18:42:01 +0200622 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200623 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400624 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200625
626#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400627}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200628
Victor Stinner910337b2011-10-03 03:20:16 +0200629
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630static PyObject*
631unicode_result_wchar(PyObject *unicode)
632{
633#ifndef Py_DEBUG
634 Py_ssize_t len;
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 len = _PyUnicode_WSTR_LENGTH(unicode);
637 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200639 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100640 }
641
642 if (len == 1) {
643 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100644 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100645 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
646 Py_DECREF(unicode);
647 return latin1_char;
648 }
649 }
650
651 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200652 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 return NULL;
654 }
655#else
Victor Stinneraa771272012-10-04 02:32:58 +0200656 assert(Py_REFCNT(unicode) == 1);
657
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100658 /* don't make the result ready in debug mode to ensure that the caller
659 makes the string ready before using it */
660 assert(_PyUnicode_CheckConsistency(unicode, 1));
661#endif
662 return unicode;
663}
664
665static PyObject*
666unicode_result_ready(PyObject *unicode)
667{
668 Py_ssize_t length;
669
670 length = PyUnicode_GET_LENGTH(unicode);
671 if (length == 0) {
672 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100673 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200674 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100675 }
676 return unicode_empty;
677 }
678
Victor Stinner607b1022020-05-05 18:50:30 +0200679#ifdef LATIN1_SINGLETONS
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100680 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300681 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200682 int kind = PyUnicode_KIND(unicode);
683 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100684 if (ch < 256) {
685 PyObject *latin1_char = unicode_latin1[ch];
686 if (latin1_char != NULL) {
687 if (unicode != latin1_char) {
688 Py_INCREF(latin1_char);
689 Py_DECREF(unicode);
690 }
691 return latin1_char;
692 }
693 else {
694 assert(_PyUnicode_CheckConsistency(unicode, 1));
695 Py_INCREF(unicode);
696 unicode_latin1[ch] = unicode;
697 return unicode;
698 }
699 }
700 }
Victor Stinner607b1022020-05-05 18:50:30 +0200701#endif
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100702
703 assert(_PyUnicode_CheckConsistency(unicode, 1));
704 return unicode;
705}
706
707static PyObject*
708unicode_result(PyObject *unicode)
709{
710 assert(_PyUnicode_CHECK(unicode));
711 if (PyUnicode_IS_READY(unicode))
712 return unicode_result_ready(unicode);
713 else
714 return unicode_result_wchar(unicode);
715}
716
Victor Stinnerc4b49542011-12-11 22:44:26 +0100717static PyObject*
718unicode_result_unchanged(PyObject *unicode)
719{
720 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500721 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722 return NULL;
723 Py_INCREF(unicode);
724 return unicode;
725 }
726 else
727 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100728 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100729}
730
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
732 ASCII, Latin1, UTF-8, etc. */
733static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200734backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
736{
Victor Stinnerad771582015-10-09 12:38:53 +0200737 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200738 Py_UCS4 ch;
739 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300740 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200741
742 assert(PyUnicode_IS_READY(unicode));
743 kind = PyUnicode_KIND(unicode);
744 data = PyUnicode_DATA(unicode);
745
746 size = 0;
747 /* determine replacement size */
748 for (i = collstart; i < collend; ++i) {
749 Py_ssize_t incr;
750
751 ch = PyUnicode_READ(kind, data, i);
752 if (ch < 0x100)
753 incr = 2+2;
754 else if (ch < 0x10000)
755 incr = 2+4;
756 else {
757 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200758 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200759 }
760 if (size > PY_SSIZE_T_MAX - incr) {
761 PyErr_SetString(PyExc_OverflowError,
762 "encoded result is too long for a Python string");
763 return NULL;
764 }
765 size += incr;
766 }
767
Victor Stinnerad771582015-10-09 12:38:53 +0200768 str = _PyBytesWriter_Prepare(writer, str, size);
769 if (str == NULL)
770 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771
772 /* generate replacement */
773 for (i = collstart; i < collend; ++i) {
774 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200775 *str++ = '\\';
776 if (ch >= 0x00010000) {
777 *str++ = 'U';
778 *str++ = Py_hexdigits[(ch>>28)&0xf];
779 *str++ = Py_hexdigits[(ch>>24)&0xf];
780 *str++ = Py_hexdigits[(ch>>20)&0xf];
781 *str++ = Py_hexdigits[(ch>>16)&0xf];
782 *str++ = Py_hexdigits[(ch>>12)&0xf];
783 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200784 }
Victor Stinner797485e2015-10-09 03:17:30 +0200785 else if (ch >= 0x100) {
786 *str++ = 'u';
787 *str++ = Py_hexdigits[(ch>>12)&0xf];
788 *str++ = Py_hexdigits[(ch>>8)&0xf];
789 }
790 else
791 *str++ = 'x';
792 *str++ = Py_hexdigits[(ch>>4)&0xf];
793 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200794 }
795 return str;
796}
797
798/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
799 ASCII, Latin1, UTF-8, etc. */
800static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200801xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200802 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
803{
Victor Stinnerad771582015-10-09 12:38:53 +0200804 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200805 Py_UCS4 ch;
806 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300807 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200808
809 assert(PyUnicode_IS_READY(unicode));
810 kind = PyUnicode_KIND(unicode);
811 data = PyUnicode_DATA(unicode);
812
813 size = 0;
814 /* determine replacement size */
815 for (i = collstart; i < collend; ++i) {
816 Py_ssize_t incr;
817
818 ch = PyUnicode_READ(kind, data, i);
819 if (ch < 10)
820 incr = 2+1+1;
821 else if (ch < 100)
822 incr = 2+2+1;
823 else if (ch < 1000)
824 incr = 2+3+1;
825 else if (ch < 10000)
826 incr = 2+4+1;
827 else if (ch < 100000)
828 incr = 2+5+1;
829 else if (ch < 1000000)
830 incr = 2+6+1;
831 else {
832 assert(ch <= MAX_UNICODE);
833 incr = 2+7+1;
834 }
835 if (size > PY_SSIZE_T_MAX - incr) {
836 PyErr_SetString(PyExc_OverflowError,
837 "encoded result is too long for a Python string");
838 return NULL;
839 }
840 size += incr;
841 }
842
Victor Stinnerad771582015-10-09 12:38:53 +0200843 str = _PyBytesWriter_Prepare(writer, str, size);
844 if (str == NULL)
845 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200846
847 /* generate replacement */
848 for (i = collstart; i < collend; ++i) {
849 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
850 }
851 return str;
852}
853
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854/* --- Bloom Filters ----------------------------------------------------- */
855
856/* stuff to implement simple "bloom filters" for Unicode characters.
857 to keep things simple, we use a single bitmask, using the least 5
858 bits from each unicode characters as the bit index. */
859
860/* the linebreak mask is set up by Unicode_Init below */
861
Antoine Pitrouf068f942010-01-13 14:19:12 +0000862#if LONG_BIT >= 128
863#define BLOOM_WIDTH 128
864#elif LONG_BIT >= 64
865#define BLOOM_WIDTH 64
866#elif LONG_BIT >= 32
867#define BLOOM_WIDTH 32
868#else
869#error "LONG_BIT is smaller than 32"
870#endif
871
Thomas Wouters477c8d52006-05-27 19:21:47 +0000872#define BLOOM_MASK unsigned long
873
Serhiy Storchaka05997252013-01-26 12:14:02 +0200874static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000875
Antoine Pitrouf068f942010-01-13 14:19:12 +0000876#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877
Benjamin Peterson29060642009-01-31 22:14:21 +0000878#define BLOOM_LINEBREAK(ch) \
879 ((ch) < 128U ? ascii_linebreak[(ch)] : \
880 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000881
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700882static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300883make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884{
Victor Stinnera85af502013-04-09 21:53:54 +0200885#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
886 do { \
887 TYPE *data = (TYPE *)PTR; \
888 TYPE *end = data + LEN; \
889 Py_UCS4 ch; \
890 for (; data != end; data++) { \
891 ch = *data; \
892 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
893 } \
894 break; \
895 } while (0)
896
Thomas Wouters477c8d52006-05-27 19:21:47 +0000897 /* calculate simple bloom-style bitmask for a given unicode string */
898
Antoine Pitrouf068f942010-01-13 14:19:12 +0000899 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000900
901 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200902 switch (kind) {
903 case PyUnicode_1BYTE_KIND:
904 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
905 break;
906 case PyUnicode_2BYTE_KIND:
907 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
908 break;
909 case PyUnicode_4BYTE_KIND:
910 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
911 break;
912 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700913 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200914 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000915 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200916
917#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000918}
919
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920static int
921ensure_unicode(PyObject *obj)
922{
923 if (!PyUnicode_Check(obj)) {
924 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200925 "must be str, not %.100s",
926 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300927 return -1;
928 }
929 return PyUnicode_READY(obj);
930}
931
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200932/* Compilation of templated routines */
933
934#include "stringlib/asciilib.h"
935#include "stringlib/fastsearch.h"
936#include "stringlib/partition.h"
937#include "stringlib/split.h"
938#include "stringlib/count.h"
939#include "stringlib/find.h"
940#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200941#include "stringlib/undef.h"
942
943#include "stringlib/ucs1lib.h"
944#include "stringlib/fastsearch.h"
945#include "stringlib/partition.h"
946#include "stringlib/split.h"
947#include "stringlib/count.h"
948#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300949#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200950#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200951#include "stringlib/undef.h"
952
953#include "stringlib/ucs2lib.h"
954#include "stringlib/fastsearch.h"
955#include "stringlib/partition.h"
956#include "stringlib/split.h"
957#include "stringlib/count.h"
958#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300959#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200960#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200961#include "stringlib/undef.h"
962
963#include "stringlib/ucs4lib.h"
964#include "stringlib/fastsearch.h"
965#include "stringlib/partition.h"
966#include "stringlib/split.h"
967#include "stringlib/count.h"
968#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300969#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200970#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200971#include "stringlib/undef.h"
972
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200973#include "stringlib/unicodedefs.h"
974#include "stringlib/fastsearch.h"
975#include "stringlib/count.h"
976#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100977#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length)
   with 0xff bytes so that code reading them before they are initialized is
   detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Does nothing when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the number of bytes per character */
    const int unit = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t cur_length = _PyUnicode_LENGTH(unicode);

    if (cur_length > old_length) {
        memset(bytes + old_length * unit, 0xff,
               (cur_length - old_length) * unit);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 Ux0000 terminated; some code (e.g. new_identifier)
1213 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 if (length == 0 && unicode_empty != NULL) {
1228 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001229 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
1231
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001232 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001233 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001234 return (PyUnicodeObject *)PyErr_NoMemory();
1235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001236 if (length < 0) {
1237 PyErr_SetString(PyExc_SystemError,
1238 "Negative size passed to _PyUnicode_New");
1239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 }
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1243 if (unicode == NULL)
1244 return NULL;
1245 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001246
1247 _PyUnicode_WSTR_LENGTH(unicode) = length;
1248 _PyUnicode_HASH(unicode) = -1;
1249 _PyUnicode_STATE(unicode).interned = 0;
1250 _PyUnicode_STATE(unicode).kind = 0;
1251 _PyUnicode_STATE(unicode).compact = 0;
1252 _PyUnicode_STATE(unicode).ready = 0;
1253 _PyUnicode_STATE(unicode).ascii = 0;
1254 _PyUnicode_DATA_ANY(unicode) = NULL;
1255 _PyUnicode_LENGTH(unicode) = 0;
1256 _PyUnicode_UTF8(unicode) = NULL;
1257 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1260 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001261 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001262 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001263 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001265
Jeremy Hyltond8082792003-09-16 19:41:39 +00001266 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001267 * the caller fails before initializing str -- unicode_resize()
1268 * reads str[0], and the Keep-Alive optimization can keep memory
1269 * allocated for str alive across a call to unicode_dealloc(unicode).
1270 * We don't want unicode_resize to read uninitialized memory in
1271 * that case.
1272 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 _PyUnicode_WSTR(unicode)[0] = 0;
1274 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001275
Victor Stinner7931d9a2011-11-04 00:22:48 +01001276 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 return unicode;
1278}
1279
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280static const char*
1281unicode_kind_name(PyObject *unicode)
1282{
Victor Stinner42dfd712011-10-03 14:41:45 +02001283 /* don't check consistency: unicode_kind_name() is called from
1284 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001285 if (!PyUnicode_IS_COMPACT(unicode))
1286 {
1287 if (!PyUnicode_IS_READY(unicode))
1288 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001289 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001290 {
1291 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001292 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001293 return "legacy ascii";
1294 else
1295 return "legacy latin1";
1296 case PyUnicode_2BYTE_KIND:
1297 return "legacy UCS2";
1298 case PyUnicode_4BYTE_KIND:
1299 return "legacy UCS4";
1300 default:
1301 return "<legacy invalid kind>";
1302 }
1303 }
1304 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001305 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001306 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001307 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001308 return "ascii";
1309 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001310 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001311 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001312 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001313 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001314 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001315 default:
1316 return "<invalid compact kind>";
1317 }
1318}
1319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. Takes the
   object as an untyped pointer and returns what the macro yields for it. */
const char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1326
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro.
   Takes the object as an untyped pointer and returns what the macro yields
   for it. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001331const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001332 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001333 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1335 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1336 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1337 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1338 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1339 return PyUnicode_DATA(unicode);
1340}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001341
1342void
1343_PyUnicode_Dump(PyObject *op)
1344{
1345 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001346 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1347 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001348 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001349
Victor Stinnera849a4b2011-10-03 12:12:11 +02001350 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001351 {
1352 if (ascii->state.ascii)
1353 data = (ascii + 1);
1354 else
1355 data = (compact + 1);
1356 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001357 else
1358 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001359 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001360
Victor Stinnera849a4b2011-10-03 12:12:11 +02001361 if (ascii->wstr == data)
1362 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001363 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001364
Victor Stinnera3b334d2011-10-03 13:53:37 +02001365 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001366 printf(" (%zu), ", compact->wstr_length);
1367 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001368 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001369 }
1370 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001371 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001372 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001373}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374#endif
1375
1376PyObject *
1377PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1378{
1379 PyObject *obj;
1380 PyCompactUnicodeObject *unicode;
1381 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001382 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001383 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 Py_ssize_t char_size;
1385 Py_ssize_t struct_size;
1386
1387 /* Optimization for empty strings */
1388 if (size == 0 && unicode_empty != NULL) {
1389 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001390 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 }
1392
Victor Stinner9e9d6892011-10-04 01:02:02 +02001393 is_ascii = 0;
1394 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 struct_size = sizeof(PyCompactUnicodeObject);
1396 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001397 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398 char_size = 1;
1399 is_ascii = 1;
1400 struct_size = sizeof(PyASCIIObject);
1401 }
1402 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001403 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 char_size = 1;
1405 }
1406 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001407 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 char_size = 2;
1409 if (sizeof(wchar_t) == 2)
1410 is_sharing = 1;
1411 }
1412 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001413 if (maxchar > MAX_UNICODE) {
1414 PyErr_SetString(PyExc_SystemError,
1415 "invalid maximum character passed to PyUnicode_New");
1416 return NULL;
1417 }
Victor Stinner8f825062012-04-27 13:55:39 +02001418 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 char_size = 4;
1420 if (sizeof(wchar_t) == 4)
1421 is_sharing = 1;
1422 }
1423
1424 /* Ensure we won't overflow the size. */
1425 if (size < 0) {
1426 PyErr_SetString(PyExc_SystemError,
1427 "Negative size passed to PyUnicode_New");
1428 return NULL;
1429 }
1430 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1431 return PyErr_NoMemory();
1432
1433 /* Duplicated allocation code from _PyObject_New() instead of a call to
1434 * PyObject_New() so we are able to allocate space for the object and
1435 * it's data buffer.
1436 */
1437 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1438 if (obj == NULL)
1439 return PyErr_NoMemory();
1440 obj = PyObject_INIT(obj, &PyUnicode_Type);
1441 if (obj == NULL)
1442 return NULL;
1443
1444 unicode = (PyCompactUnicodeObject *)obj;
1445 if (is_ascii)
1446 data = ((PyASCIIObject*)obj) + 1;
1447 else
1448 data = unicode + 1;
1449 _PyUnicode_LENGTH(unicode) = size;
1450 _PyUnicode_HASH(unicode) = -1;
1451 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001452 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 _PyUnicode_STATE(unicode).compact = 1;
1454 _PyUnicode_STATE(unicode).ready = 1;
1455 _PyUnicode_STATE(unicode).ascii = is_ascii;
1456 if (is_ascii) {
1457 ((char*)data)[size] = 0;
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 }
Victor Stinner8f825062012-04-27 13:55:39 +02001460 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 ((char*)data)[size] = 0;
1462 _PyUnicode_WSTR(unicode) = NULL;
1463 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001465 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 else {
1468 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001469 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001470 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001472 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 ((Py_UCS4*)data)[size] = 0;
1474 if (is_sharing) {
1475 _PyUnicode_WSTR_LENGTH(unicode) = size;
1476 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1477 }
1478 else {
1479 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1480 _PyUnicode_WSTR(unicode) = NULL;
1481 }
1482 }
Victor Stinner8f825062012-04-27 13:55:39 +02001483#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001484 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001485#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 return obj;
1488}
1489
1490#if SIZEOF_WCHAR_T == 2
1491/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1492 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001493 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494
1495 This function assumes that unicode can hold one more code point than wstr
1496 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001497static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001499 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001500{
1501 const wchar_t *iter;
1502 Py_UCS4 *ucs4_out;
1503
Victor Stinner910337b2011-10-03 03:20:16 +02001504 assert(unicode != NULL);
1505 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001506 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1507 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1508
1509 for (iter = begin; iter < end; ) {
1510 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1511 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001512 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1513 && (iter+1) < end
1514 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 {
Victor Stinner551ac952011-11-29 22:58:13 +01001516 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517 iter += 2;
1518 }
1519 else {
1520 *ucs4_out++ = *iter;
1521 iter++;
1522 }
1523 }
1524 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1525 _PyUnicode_GET_LENGTH(unicode)));
1526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527}
1528#endif
1529
Victor Stinnercd9950f2011-10-02 00:34:53 +02001530static int
Victor Stinner488fa492011-12-12 00:01:39 +01001531unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001532{
Victor Stinner488fa492011-12-12 00:01:39 +01001533 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001534 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001535 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001536 return -1;
1537 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001538 return 0;
1539}
1540
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001541static int
1542_copy_characters(PyObject *to, Py_ssize_t to_start,
1543 PyObject *from, Py_ssize_t from_start,
1544 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001545{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001546 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001547 const void *from_data;
1548 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549
Victor Stinneree4544c2012-05-09 22:24:08 +02001550 assert(0 <= how_many);
1551 assert(0 <= from_start);
1552 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001553 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001554 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001555 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001556
Victor Stinnerd3f08822012-05-29 12:57:52 +02001557 assert(PyUnicode_Check(to));
1558 assert(PyUnicode_IS_READY(to));
1559 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1560
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001561 if (how_many == 0)
1562 return 0;
1563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001565 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001567 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568
Victor Stinnerf1852262012-06-16 16:38:26 +02001569#ifdef Py_DEBUG
1570 if (!check_maxchar
1571 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1572 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001573 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001574 Py_UCS4 ch;
1575 Py_ssize_t i;
1576 for (i=0; i < how_many; i++) {
1577 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1578 assert(ch <= to_maxchar);
1579 }
1580 }
1581#endif
1582
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001583 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001584 if (check_maxchar
1585 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1586 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001587 /* Writing Latin-1 characters into an ASCII string requires to
1588 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001589 Py_UCS4 max_char;
1590 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001591 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001592 if (max_char >= 128)
1593 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001594 }
Christian Heimesf051e432016-09-13 20:22:02 +02001595 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001596 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001597 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001598 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001599 else if (from_kind == PyUnicode_1BYTE_KIND
1600 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001601 {
1602 _PyUnicode_CONVERT_BYTES(
1603 Py_UCS1, Py_UCS2,
1604 PyUnicode_1BYTE_DATA(from) + from_start,
1605 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1606 PyUnicode_2BYTE_DATA(to) + to_start
1607 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001608 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001609 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001610 && to_kind == PyUnicode_4BYTE_KIND)
1611 {
1612 _PyUnicode_CONVERT_BYTES(
1613 Py_UCS1, Py_UCS4,
1614 PyUnicode_1BYTE_DATA(from) + from_start,
1615 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1616 PyUnicode_4BYTE_DATA(to) + to_start
1617 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001618 }
1619 else if (from_kind == PyUnicode_2BYTE_KIND
1620 && to_kind == PyUnicode_4BYTE_KIND)
1621 {
1622 _PyUnicode_CONVERT_BYTES(
1623 Py_UCS2, Py_UCS4,
1624 PyUnicode_2BYTE_DATA(from) + from_start,
1625 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1626 PyUnicode_4BYTE_DATA(to) + to_start
1627 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001628 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001629 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001630 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1631
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001632 if (!check_maxchar) {
1633 if (from_kind == PyUnicode_2BYTE_KIND
1634 && to_kind == PyUnicode_1BYTE_KIND)
1635 {
1636 _PyUnicode_CONVERT_BYTES(
1637 Py_UCS2, Py_UCS1,
1638 PyUnicode_2BYTE_DATA(from) + from_start,
1639 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1640 PyUnicode_1BYTE_DATA(to) + to_start
1641 );
1642 }
1643 else if (from_kind == PyUnicode_4BYTE_KIND
1644 && to_kind == PyUnicode_1BYTE_KIND)
1645 {
1646 _PyUnicode_CONVERT_BYTES(
1647 Py_UCS4, Py_UCS1,
1648 PyUnicode_4BYTE_DATA(from) + from_start,
1649 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1650 PyUnicode_1BYTE_DATA(to) + to_start
1651 );
1652 }
1653 else if (from_kind == PyUnicode_4BYTE_KIND
1654 && to_kind == PyUnicode_2BYTE_KIND)
1655 {
1656 _PyUnicode_CONVERT_BYTES(
1657 Py_UCS4, Py_UCS2,
1658 PyUnicode_4BYTE_DATA(from) + from_start,
1659 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1660 PyUnicode_2BYTE_DATA(to) + to_start
1661 );
1662 }
1663 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001664 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001665 }
1666 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001667 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001668 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001669 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001670 Py_ssize_t i;
1671
Victor Stinnera0702ab2011-09-29 14:14:38 +02001672 for (i=0; i < how_many; i++) {
1673 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001674 if (ch > to_maxchar)
1675 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001676 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1677 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001678 }
1679 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001680 return 0;
1681}
1682
Victor Stinnerd3f08822012-05-29 12:57:52 +02001683void
1684_PyUnicode_FastCopyCharacters(
1685 PyObject *to, Py_ssize_t to_start,
1686 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001687{
1688 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1689}
1690
1691Py_ssize_t
1692PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1693 PyObject *from, Py_ssize_t from_start,
1694 Py_ssize_t how_many)
1695{
1696 int err;
1697
1698 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1699 PyErr_BadInternalCall();
1700 return -1;
1701 }
1702
Benjamin Petersonbac79492012-01-14 13:34:47 -05001703 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001704 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001705 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001706 return -1;
1707
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001708 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001709 PyErr_SetString(PyExc_IndexError, "string index out of range");
1710 return -1;
1711 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001712 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001713 PyErr_SetString(PyExc_IndexError, "string index out of range");
1714 return -1;
1715 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001716 if (how_many < 0) {
1717 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1718 return -1;
1719 }
1720 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001721 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1722 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001723 "Cannot write %zi characters at %zi "
1724 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001725 how_many, to_start, PyUnicode_GET_LENGTH(to));
1726 return -1;
1727 }
1728
1729 if (how_many == 0)
1730 return 0;
1731
Victor Stinner488fa492011-12-12 00:01:39 +01001732 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001733 return -1;
1734
1735 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1736 if (err) {
1737 PyErr_Format(PyExc_SystemError,
1738 "Cannot copy %s characters "
1739 "into a string of %s characters",
1740 unicode_kind_name(from),
1741 unicode_kind_name(to));
1742 return -1;
1743 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001744 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745}
1746
Victor Stinner17222162011-09-28 22:15:37 +02001747/* Find the maximum code point and count the number of surrogate pairs so a
1748 correct string length can be computed before converting a string to UCS4.
1749 This function counts single surrogates as a character and not as a pair.
1750
1751 Return 0 on success, or -1 on error. */
1752static int
1753find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1754 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755{
1756 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001757 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758
Victor Stinnerc53be962011-10-02 21:33:54 +02001759 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 *num_surrogates = 0;
1761 *maxchar = 0;
1762
1763 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001765 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1766 && (iter+1) < end
1767 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1768 {
1769 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1770 ++(*num_surrogates);
1771 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 }
1773 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001775 {
1776 ch = *iter;
1777 iter++;
1778 }
1779 if (ch > *maxchar) {
1780 *maxchar = ch;
1781 if (*maxchar > MAX_UNICODE) {
1782 PyErr_Format(PyExc_ValueError,
1783 "character U+%x is not in range [U+0000; U+10ffff]",
1784 ch);
1785 return -1;
1786 }
1787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 }
1789 return 0;
1790}
1791
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001792int
1793_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794{
1795 wchar_t *end;
1796 Py_UCS4 maxchar = 0;
1797 Py_ssize_t num_surrogates;
1798#if SIZEOF_WCHAR_T == 2
1799 Py_ssize_t length_wo_surrogates;
1800#endif
1801
Georg Brandl7597add2011-10-05 16:36:47 +02001802 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001803 strings were created using _PyObject_New() and where no canonical
1804 representation (the str field) has been set yet aka strings
1805 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001806 assert(_PyUnicode_CHECK(unicode));
1807 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001809 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001810 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001811 /* Actually, it should neither be interned nor be anything else: */
1812 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001815 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001816 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818
1819 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001820 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1821 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 PyErr_NoMemory();
1823 return -1;
1824 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001825 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 _PyUnicode_WSTR(unicode), end,
1827 PyUnicode_1BYTE_DATA(unicode));
1828 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1829 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1830 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1831 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001832 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001833 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001834 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 }
1836 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001837 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001838 _PyUnicode_UTF8(unicode) = NULL;
1839 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 }
1841 PyObject_FREE(_PyUnicode_WSTR(unicode));
1842 _PyUnicode_WSTR(unicode) = NULL;
1843 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1844 }
1845 /* In this case we might have to convert down from 4-byte native
1846 wchar_t to 2-byte unicode. */
1847 else if (maxchar < 65536) {
1848 assert(num_surrogates == 0 &&
1849 "FindMaxCharAndNumSurrogatePairs() messed up");
1850
Victor Stinner506f5922011-09-28 22:34:18 +02001851#if SIZEOF_WCHAR_T == 2
1852 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001853 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001854 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1855 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1856 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001857 _PyUnicode_UTF8(unicode) = NULL;
1858 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001859#else
1860 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001861 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001862 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001863 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001864 PyErr_NoMemory();
1865 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 }
Victor Stinner506f5922011-09-28 22:34:18 +02001867 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1868 _PyUnicode_WSTR(unicode), end,
1869 PyUnicode_2BYTE_DATA(unicode));
1870 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1871 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1872 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001873 _PyUnicode_UTF8(unicode) = NULL;
1874 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001875 PyObject_FREE(_PyUnicode_WSTR(unicode));
1876 _PyUnicode_WSTR(unicode) = NULL;
1877 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1878#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 }
1880 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1881 else {
1882#if SIZEOF_WCHAR_T == 2
1883 /* in case the native representation is 2-bytes, we need to allocate a
1884 new normalized 4-byte version. */
1885 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001886 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1887 PyErr_NoMemory();
1888 return -1;
1889 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001890 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1891 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 PyErr_NoMemory();
1893 return -1;
1894 }
1895 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1896 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001897 _PyUnicode_UTF8(unicode) = NULL;
1898 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001899 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1900 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001901 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 PyObject_FREE(_PyUnicode_WSTR(unicode));
1903 _PyUnicode_WSTR(unicode) = NULL;
1904 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1905#else
1906 assert(num_surrogates == 0);
1907
Victor Stinnerc3c74152011-10-02 20:39:55 +02001908 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001910 _PyUnicode_UTF8(unicode) = NULL;
1911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1913#endif
1914 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1915 }
1916 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001917 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 return 0;
1919}
1920
Alexander Belopolsky40018472011-02-26 01:02:56 +00001921static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001922unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923{
Walter Dörwald16807132007-05-25 13:52:07 +00001924 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001925 case SSTATE_NOT_INTERNED:
1926 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001927
Benjamin Peterson29060642009-01-31 22:14:21 +00001928 case SSTATE_INTERNED_MORTAL:
1929 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001930 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001931#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001932 if (PyDict_DelItem(interned, unicode) != 0) {
1933 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1934 NULL);
1935 }
Victor Stinner607b1022020-05-05 18:50:30 +02001936#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001937 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001938
Benjamin Peterson29060642009-01-31 22:14:21 +00001939 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001940 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1941 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001942
Benjamin Peterson29060642009-01-31 22:14:21 +00001943 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001944 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001945 }
1946
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001947 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001949 }
1950 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001951 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001952 }
1953 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001954 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001957 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001958}
1959
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001960#ifdef Py_DEBUG
1961static int
1962unicode_is_singleton(PyObject *unicode)
1963{
Victor Stinner607b1022020-05-05 18:50:30 +02001964 if (unicode == unicode_empty) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001965 return 1;
Victor Stinner607b1022020-05-05 18:50:30 +02001966 }
1967#ifdef LATIN1_SINGLETONS
1968 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001969 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1970 {
1971 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1972 if (ch < 256 && unicode_latin1[ch] == unicode)
1973 return 1;
1974 }
Victor Stinner607b1022020-05-05 18:50:30 +02001975#endif
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001976 return 0;
1977}
1978#endif
1979
Alexander Belopolsky40018472011-02-26 01:02:56 +00001980static int
Victor Stinner488fa492011-12-12 00:01:39 +01001981unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001982{
Victor Stinner488fa492011-12-12 00:01:39 +01001983 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001984 if (Py_REFCNT(unicode) != 1)
1985 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001986 if (_PyUnicode_HASH(unicode) != -1)
1987 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001988 if (PyUnicode_CHECK_INTERNED(unicode))
1989 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001990 if (!PyUnicode_CheckExact(unicode))
1991 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001992#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001993 /* singleton refcount is greater than 1 */
1994 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001995#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001996 return 1;
1997}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001998
Victor Stinnerfe226c02011-10-03 03:52:20 +02001999static int
2000unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2001{
2002 PyObject *unicode;
2003 Py_ssize_t old_length;
2004
2005 assert(p_unicode != NULL);
2006 unicode = *p_unicode;
2007
2008 assert(unicode != NULL);
2009 assert(PyUnicode_Check(unicode));
2010 assert(0 <= length);
2011
Victor Stinner910337b2011-10-03 03:20:16 +02002012 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002013 old_length = PyUnicode_WSTR_LENGTH(unicode);
2014 else
2015 old_length = PyUnicode_GET_LENGTH(unicode);
2016 if (old_length == length)
2017 return 0;
2018
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002019 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002020 _Py_INCREF_UNICODE_EMPTY();
2021 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002022 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002023 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002024 return 0;
2025 }
2026
Victor Stinner488fa492011-12-12 00:01:39 +01002027 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002028 PyObject *copy = resize_copy(unicode, length);
2029 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002030 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002031 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002032 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002033 }
2034
Victor Stinnerfe226c02011-10-03 03:52:20 +02002035 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002036 PyObject *new_unicode = resize_compact(unicode, length);
2037 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002038 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002039 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002040 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002041 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002042 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002043}
2044
Alexander Belopolsky40018472011-02-26 01:02:56 +00002045int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002046PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002047{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002048 PyObject *unicode;
2049 if (p_unicode == NULL) {
2050 PyErr_BadInternalCall();
2051 return -1;
2052 }
2053 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002054 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002055 {
2056 PyErr_BadInternalCall();
2057 return -1;
2058 }
2059 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002060}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002061
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002062/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002063
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002064 WARNING: The function doesn't copy the terminating null character and
2065 doesn't check the maximum character (may write a latin1 character in an
2066 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002067static void
2068unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2069 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002070{
2071 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002072 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002073 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002074
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002075 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002076 switch (kind) {
2077 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002078#ifdef Py_DEBUG
2079 if (PyUnicode_IS_ASCII(unicode)) {
2080 Py_UCS4 maxchar = ucs1lib_find_max_char(
2081 (const Py_UCS1*)str,
2082 (const Py_UCS1*)str + len);
2083 assert(maxchar < 128);
2084 }
2085#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002086 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002087 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002088 }
2089 case PyUnicode_2BYTE_KIND: {
2090 Py_UCS2 *start = (Py_UCS2 *)data + index;
2091 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002092
Victor Stinner184252a2012-06-16 02:57:41 +02002093 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002094 *ucs2 = (Py_UCS2)*str;
2095
2096 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002097 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002098 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002099 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002100 Py_UCS4 *start = (Py_UCS4 *)data + index;
2101 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002102
Victor Stinner184252a2012-06-16 02:57:41 +02002103 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002104 *ucs4 = (Py_UCS4)*str;
2105
2106 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002107 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002108 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002109 default:
2110 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002111 }
2112}
2113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002114static PyObject*
2115get_latin1_char(unsigned char ch)
2116{
Victor Stinner607b1022020-05-05 18:50:30 +02002117 PyObject *unicode;
2118
2119#ifdef LATIN1_SINGLETONS
2120 unicode = unicode_latin1[ch];
2121 if (unicode) {
2122 Py_INCREF(unicode);
2123 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002124 }
Victor Stinner607b1022020-05-05 18:50:30 +02002125#endif
2126
2127 unicode = PyUnicode_New(1, ch);
2128 if (!unicode) {
2129 return NULL;
2130 }
2131
2132 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2133 assert(_PyUnicode_CheckConsistency(unicode, 1));
2134
2135#ifdef LATIN1_SINGLETONS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 Py_INCREF(unicode);
Victor Stinner607b1022020-05-05 18:50:30 +02002137 unicode_latin1[ch] = unicode;
2138#endif
Victor Stinnera464fc12011-10-02 20:39:30 +02002139 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140}
2141
Victor Stinner985a82a2014-01-03 12:53:47 +01002142static PyObject*
2143unicode_char(Py_UCS4 ch)
2144{
2145 PyObject *unicode;
2146
2147 assert(ch <= MAX_UNICODE);
2148
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002149 if (ch < 256)
2150 return get_latin1_char(ch);
2151
Victor Stinner985a82a2014-01-03 12:53:47 +01002152 unicode = PyUnicode_New(1, ch);
2153 if (unicode == NULL)
2154 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002155
2156 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2157 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002158 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002159 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002160 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2161 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2162 }
2163 assert(_PyUnicode_CheckConsistency(unicode, 1));
2164 return unicode;
2165}
2166
Alexander Belopolsky40018472011-02-26 01:02:56 +00002167PyObject *
2168PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002170 if (u == NULL)
2171 return (PyObject*)_PyUnicode_New(size);
2172
2173 if (size < 0) {
2174 PyErr_BadInternalCall();
2175 return NULL;
2176 }
2177
2178 return PyUnicode_FromWideChar(u, size);
2179}
2180
2181PyObject *
2182PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2183{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002184 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 Py_UCS4 maxchar = 0;
2186 Py_ssize_t num_surrogates;
2187
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002188 if (u == NULL && size != 0) {
2189 PyErr_BadInternalCall();
2190 return NULL;
2191 }
2192
2193 if (size == -1) {
2194 size = wcslen(u);
2195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002197 /* If the Unicode data is known at construction time, we can apply
2198 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002201 if (size == 0)
2202 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 /* Single character Unicode objects in the Latin-1 range are
2205 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002206 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 return get_latin1_char((unsigned char)*u);
2208
2209 /* If not empty and not single character, copy the Unicode data
2210 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002211 if (find_maxchar_surrogates(u, u + size,
2212 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 return NULL;
2214
Victor Stinner8faf8212011-12-08 22:14:11 +01002215 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216 if (!unicode)
2217 return NULL;
2218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 switch (PyUnicode_KIND(unicode)) {
2220 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002221 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2223 break;
2224 case PyUnicode_2BYTE_KIND:
2225#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002226 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002228 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2230#endif
2231 break;
2232 case PyUnicode_4BYTE_KIND:
2233#if SIZEOF_WCHAR_T == 2
2234 /* This is the only case which has to process surrogates, thus
2235 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002236 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237#else
2238 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002239 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240#endif
2241 break;
2242 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002243 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247}
2248
Alexander Belopolsky40018472011-02-26 01:02:56 +00002249PyObject *
2250PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002251{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002252 if (size < 0) {
2253 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002254 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002255 return NULL;
2256 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002257 if (u != NULL)
2258 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2259 else
2260 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002261}
2262
Alexander Belopolsky40018472011-02-26 01:02:56 +00002263PyObject *
2264PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002265{
2266 size_t size = strlen(u);
2267 if (size > PY_SSIZE_T_MAX) {
2268 PyErr_SetString(PyExc_OverflowError, "input too long");
2269 return NULL;
2270 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002271 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002272}
2273
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002274PyObject *
2275_PyUnicode_FromId(_Py_Identifier *id)
2276{
Victor Stinner297257f2020-06-02 14:39:45 +02002277 if (id->object) {
2278 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002279 }
Victor Stinner297257f2020-06-02 14:39:45 +02002280
2281 PyObject *obj;
2282 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2283 strlen(id->string),
2284 NULL, NULL);
2285 if (!obj) {
2286 return NULL;
2287 }
2288 PyUnicode_InternInPlace(&obj);
2289
2290 assert(!id->next);
2291 id->object = obj;
2292 id->next = static_strings;
2293 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002294 return id->object;
2295}
2296
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002297static void
2298unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002299{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002300 _Py_Identifier *tmp, *s = static_strings;
2301 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002302 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002303 tmp = s->next;
2304 s->next = NULL;
2305 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002306 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002307 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002308}
2309
Benjamin Peterson0df54292012-03-26 14:50:32 -04002310/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002311
Victor Stinnerd3f08822012-05-29 12:57:52 +02002312PyObject*
2313_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002314{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002315 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002316 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002317 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002318#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002319 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002320#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002321 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002322 }
Victor Stinner785938e2011-12-11 20:09:03 +01002323 unicode = PyUnicode_New(size, 127);
2324 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002325 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002326 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2327 assert(_PyUnicode_CheckConsistency(unicode, 1));
2328 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002329}
2330
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002331static Py_UCS4
2332kind_maxchar_limit(unsigned int kind)
2333{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002334 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002335 case PyUnicode_1BYTE_KIND:
2336 return 0x80;
2337 case PyUnicode_2BYTE_KIND:
2338 return 0x100;
2339 case PyUnicode_4BYTE_KIND:
2340 return 0x10000;
2341 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002342 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002343 }
2344}
2345
Victor Stinner702c7342011-10-05 13:50:52 +02002346static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002347_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002348{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002350 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002351
Serhiy Storchaka678db842013-01-26 12:16:36 +02002352 if (size == 0)
2353 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002354 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002355 if (size == 1)
2356 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002357
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002358 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002359 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 if (!res)
2361 return NULL;
2362 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002363 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002365}
2366
Victor Stinnere57b1c02011-09-28 22:20:48 +02002367static PyObject*
2368_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369{
2370 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002371 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002372
Serhiy Storchaka678db842013-01-26 12:16:36 +02002373 if (size == 0)
2374 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002375 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002376 if (size == 1)
2377 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002378
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002379 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002380 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 if (!res)
2382 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002383 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002385 else {
2386 _PyUnicode_CONVERT_BYTES(
2387 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2388 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002389 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 return res;
2391}
2392
Victor Stinnere57b1c02011-09-28 22:20:48 +02002393static PyObject*
2394_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395{
2396 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002397 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002398
Serhiy Storchaka678db842013-01-26 12:16:36 +02002399 if (size == 0)
2400 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002401 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002402 if (size == 1)
2403 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002404
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002405 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002406 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 if (!res)
2408 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002409 if (max_char < 256)
2410 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2411 PyUnicode_1BYTE_DATA(res));
2412 else if (max_char < 0x10000)
2413 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2414 PyUnicode_2BYTE_DATA(res));
2415 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002417 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 return res;
2419}
2420
2421PyObject*
2422PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2423{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002424 if (size < 0) {
2425 PyErr_SetString(PyExc_ValueError, "size must be positive");
2426 return NULL;
2427 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002428 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002430 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002431 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002432 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002434 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002435 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002436 PyErr_SetString(PyExc_SystemError, "invalid kind");
2437 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439}
2440
Victor Stinnerece58de2012-04-23 23:36:38 +02002441Py_UCS4
2442_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2443{
2444 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002445 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002446
2447 assert(PyUnicode_IS_READY(unicode));
2448 assert(0 <= start);
2449 assert(end <= PyUnicode_GET_LENGTH(unicode));
2450 assert(start <= end);
2451
2452 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2453 return PyUnicode_MAX_CHAR_VALUE(unicode);
2454
2455 if (start == end)
2456 return 127;
2457
Victor Stinner94d558b2012-04-27 22:26:58 +02002458 if (PyUnicode_IS_ASCII(unicode))
2459 return 127;
2460
Victor Stinnerece58de2012-04-23 23:36:38 +02002461 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002462 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002463 endptr = (char *)startptr + end * kind;
2464 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002465 switch(kind) {
2466 case PyUnicode_1BYTE_KIND:
2467 return ucs1lib_find_max_char(startptr, endptr);
2468 case PyUnicode_2BYTE_KIND:
2469 return ucs2lib_find_max_char(startptr, endptr);
2470 case PyUnicode_4BYTE_KIND:
2471 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002472 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002473 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002474 }
2475}
2476
Victor Stinner25a4b292011-10-06 12:31:55 +02002477/* Ensure that a string uses the most efficient storage, if it is not the
2478 case: create a new string with of the right kind. Write NULL into *p_unicode
2479 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002480static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002481unicode_adjust_maxchar(PyObject **p_unicode)
2482{
2483 PyObject *unicode, *copy;
2484 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002485 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002486 unsigned int kind;
2487
2488 assert(p_unicode != NULL);
2489 unicode = *p_unicode;
2490 assert(PyUnicode_IS_READY(unicode));
2491 if (PyUnicode_IS_ASCII(unicode))
2492 return;
2493
2494 len = PyUnicode_GET_LENGTH(unicode);
2495 kind = PyUnicode_KIND(unicode);
2496 if (kind == PyUnicode_1BYTE_KIND) {
2497 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002498 max_char = ucs1lib_find_max_char(u, u + len);
2499 if (max_char >= 128)
2500 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002501 }
2502 else if (kind == PyUnicode_2BYTE_KIND) {
2503 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002504 max_char = ucs2lib_find_max_char(u, u + len);
2505 if (max_char >= 256)
2506 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002507 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002508 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002509 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002510 max_char = ucs4lib_find_max_char(u, u + len);
2511 if (max_char >= 0x10000)
2512 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002513 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002514 else
2515 Py_UNREACHABLE();
2516
Victor Stinner25a4b292011-10-06 12:31:55 +02002517 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002518 if (copy != NULL)
2519 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002520 Py_DECREF(unicode);
2521 *p_unicode = copy;
2522}
2523
Victor Stinner034f6cf2011-09-30 02:26:44 +02002524PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002525_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002526{
Victor Stinner87af4f22011-11-21 23:03:47 +01002527 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002528 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002529
Victor Stinner034f6cf2011-09-30 02:26:44 +02002530 if (!PyUnicode_Check(unicode)) {
2531 PyErr_BadInternalCall();
2532 return NULL;
2533 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002534 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002535 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002536
Victor Stinner87af4f22011-11-21 23:03:47 +01002537 length = PyUnicode_GET_LENGTH(unicode);
2538 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002539 if (!copy)
2540 return NULL;
2541 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2542
Christian Heimesf051e432016-09-13 20:22:02 +02002543 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002544 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002545 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002546 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002547}
2548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549
Victor Stinnerbc603d12011-10-02 01:00:40 +02002550/* Widen Unicode objects to larger buffers. Don't write terminating null
2551 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002553static void*
2554unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002555{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002556 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002557
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002558 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002559 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002560 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002561 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002562 if (!result)
2563 return PyErr_NoMemory();
2564 assert(skind == PyUnicode_1BYTE_KIND);
2565 _PyUnicode_CONVERT_BYTES(
2566 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002567 (const Py_UCS1 *)data,
2568 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002569 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002571 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002572 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002573 if (!result)
2574 return PyErr_NoMemory();
2575 if (skind == PyUnicode_2BYTE_KIND) {
2576 _PyUnicode_CONVERT_BYTES(
2577 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002578 (const Py_UCS2 *)data,
2579 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002580 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002582 else {
2583 assert(skind == PyUnicode_1BYTE_KIND);
2584 _PyUnicode_CONVERT_BYTES(
2585 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002586 (const Py_UCS1 *)data,
2587 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002588 result);
2589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002591 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002592 Py_UNREACHABLE();
2593 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595}
2596
2597static Py_UCS4*
2598as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2599 int copy_null)
2600{
2601 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002602 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 Py_ssize_t len, targetlen;
2604 if (PyUnicode_READY(string) == -1)
2605 return NULL;
2606 kind = PyUnicode_KIND(string);
2607 data = PyUnicode_DATA(string);
2608 len = PyUnicode_GET_LENGTH(string);
2609 targetlen = len;
2610 if (copy_null)
2611 targetlen++;
2612 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002613 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 if (!target) {
2615 PyErr_NoMemory();
2616 return NULL;
2617 }
2618 }
2619 else {
2620 if (targetsize < targetlen) {
2621 PyErr_Format(PyExc_SystemError,
2622 "string is longer than the buffer");
2623 if (copy_null && 0 < targetsize)
2624 target[0] = 0;
2625 return NULL;
2626 }
2627 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002628 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002629 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002630 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002632 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002633 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002634 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2635 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002636 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002637 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002638 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002639 else {
2640 Py_UNREACHABLE();
2641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 if (copy_null)
2643 target[len] = 0;
2644 return target;
2645}
2646
2647Py_UCS4*
2648PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2649 int copy_null)
2650{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002651 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 PyErr_BadInternalCall();
2653 return NULL;
2654 }
2655 return as_ucs4(string, target, targetsize, copy_null);
2656}
2657
2658Py_UCS4*
2659PyUnicode_AsUCS4Copy(PyObject *string)
2660{
2661 return as_ucs4(string, NULL, 0, 1);
2662}
2663
/* Size of the scratch buffers used below for the output of %lld or %p.

   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, and 53/22 is an upper bound for log10(256).  The constant 2 that
   is added to the (truncating) division leaves room for the rounding and
   for a sign or the terminating null character: with an 8-byte long long
   the result is 21, i.e. 20 characters plus the terminator. */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002668
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002669static int
2670unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2671 Py_ssize_t width, Py_ssize_t precision)
2672{
2673 Py_ssize_t length, fill, arglen;
2674 Py_UCS4 maxchar;
2675
2676 if (PyUnicode_READY(str) == -1)
2677 return -1;
2678
2679 length = PyUnicode_GET_LENGTH(str);
2680 if ((precision == -1 || precision >= length)
2681 && width <= length)
2682 return _PyUnicodeWriter_WriteStr(writer, str);
2683
2684 if (precision != -1)
2685 length = Py_MIN(precision, length);
2686
2687 arglen = Py_MAX(length, width);
2688 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2689 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2690 else
2691 maxchar = writer->maxchar;
2692
2693 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2694 return -1;
2695
2696 if (width > length) {
2697 fill = width - length;
2698 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2699 return -1;
2700 writer->pos += fill;
2701 }
2702
2703 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2704 str, 0, length);
2705 writer->pos += length;
2706 return 0;
2707}
2708
2709static int
Victor Stinner998b8062018-09-12 00:23:25 +02002710unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002711 Py_ssize_t width, Py_ssize_t precision)
2712{
2713 /* UTF-8 */
2714 Py_ssize_t length;
2715 PyObject *unicode;
2716 int res;
2717
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002718 if (precision == -1) {
2719 length = strlen(str);
2720 }
2721 else {
2722 length = 0;
2723 while (length < precision && str[length]) {
2724 length++;
2725 }
2726 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002727 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2728 if (unicode == NULL)
2729 return -1;
2730
2731 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2732 Py_DECREF(unicode);
2733 return res;
2734}
2735
Victor Stinner96865452011-03-01 23:44:09 +00002736static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002737unicode_fromformat_arg(_PyUnicodeWriter *writer,
2738 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002739{
Victor Stinnere215d962012-10-06 23:03:36 +02002740 const char *p;
2741 Py_ssize_t len;
2742 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002743 Py_ssize_t width;
2744 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002745 int longflag;
2746 int longlongflag;
2747 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002748 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002749
2750 p = f;
2751 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002752 zeropad = 0;
2753 if (*f == '0') {
2754 zeropad = 1;
2755 f++;
2756 }
Victor Stinner96865452011-03-01 23:44:09 +00002757
2758 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002759 width = -1;
2760 if (Py_ISDIGIT((unsigned)*f)) {
2761 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002762 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002763 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002764 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002765 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002766 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002767 return NULL;
2768 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002769 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002770 f++;
2771 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002772 }
2773 precision = -1;
2774 if (*f == '.') {
2775 f++;
2776 if (Py_ISDIGIT((unsigned)*f)) {
2777 precision = (*f - '0');
2778 f++;
2779 while (Py_ISDIGIT((unsigned)*f)) {
2780 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2781 PyErr_SetString(PyExc_ValueError,
2782 "precision too big");
2783 return NULL;
2784 }
2785 precision = (precision * 10) + (*f - '0');
2786 f++;
2787 }
2788 }
Victor Stinner96865452011-03-01 23:44:09 +00002789 if (*f == '%') {
2790 /* "%.3%s" => f points to "3" */
2791 f--;
2792 }
2793 }
2794 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002795 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002796 f--;
2797 }
Victor Stinner96865452011-03-01 23:44:09 +00002798
2799 /* Handle %ld, %lu, %lld and %llu. */
2800 longflag = 0;
2801 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002802 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002803 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002804 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002805 longflag = 1;
2806 ++f;
2807 }
Victor Stinner96865452011-03-01 23:44:09 +00002808 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002809 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002810 longlongflag = 1;
2811 f += 2;
2812 }
Victor Stinner96865452011-03-01 23:44:09 +00002813 }
2814 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002815 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002816 size_tflag = 1;
2817 ++f;
2818 }
Victor Stinnere215d962012-10-06 23:03:36 +02002819
2820 if (f[1] == '\0')
2821 writer->overallocate = 0;
2822
2823 switch (*f) {
2824 case 'c':
2825 {
2826 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002827 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002828 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002829 "character argument not in range(0x110000)");
2830 return NULL;
2831 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002832 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002833 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002834 break;
2835 }
2836
2837 case 'i':
2838 case 'd':
2839 case 'u':
2840 case 'x':
2841 {
2842 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002843 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002844 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002845
2846 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002847 if (longflag) {
2848 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2849 }
2850 else if (longlongflag) {
2851 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2852 }
2853 else if (size_tflag) {
2854 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2855 }
2856 else {
2857 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2858 }
Victor Stinnere215d962012-10-06 23:03:36 +02002859 }
2860 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002861 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002862 }
2863 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002864 if (longflag) {
2865 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2866 }
2867 else if (longlongflag) {
2868 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2869 }
2870 else if (size_tflag) {
2871 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2872 }
2873 else {
2874 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2875 }
Victor Stinnere215d962012-10-06 23:03:36 +02002876 }
2877 assert(len >= 0);
2878
Victor Stinnere215d962012-10-06 23:03:36 +02002879 if (precision < len)
2880 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002881
2882 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002883 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2884 return NULL;
2885
Victor Stinnere215d962012-10-06 23:03:36 +02002886 if (width > precision) {
2887 Py_UCS4 fillchar;
2888 fill = width - precision;
2889 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002890 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2891 return NULL;
2892 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002893 }
Victor Stinner15a11362012-10-06 23:48:20 +02002894 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002895 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002896 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2897 return NULL;
2898 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002899 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002900
Victor Stinner4a587072013-11-19 12:54:53 +01002901 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2902 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002903 break;
2904 }
2905
2906 case 'p':
2907 {
2908 char number[MAX_LONG_LONG_CHARS];
2909
2910 len = sprintf(number, "%p", va_arg(*vargs, void*));
2911 assert(len >= 0);
2912
2913 /* %p is ill-defined: ensure leading 0x. */
2914 if (number[1] == 'X')
2915 number[1] = 'x';
2916 else if (number[1] != 'x') {
2917 memmove(number + 2, number,
2918 strlen(number) + 1);
2919 number[0] = '0';
2920 number[1] = 'x';
2921 len += 2;
2922 }
2923
Victor Stinner4a587072013-11-19 12:54:53 +01002924 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002925 return NULL;
2926 break;
2927 }
2928
2929 case 's':
2930 {
2931 /* UTF-8 */
2932 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002933 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002934 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002935 break;
2936 }
2937
2938 case 'U':
2939 {
2940 PyObject *obj = va_arg(*vargs, PyObject *);
2941 assert(obj && _PyUnicode_CHECK(obj));
2942
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002943 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002944 return NULL;
2945 break;
2946 }
2947
2948 case 'V':
2949 {
2950 PyObject *obj = va_arg(*vargs, PyObject *);
2951 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002952 if (obj) {
2953 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002954 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002955 return NULL;
2956 }
2957 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002958 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002959 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002960 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002961 }
2962 break;
2963 }
2964
2965 case 'S':
2966 {
2967 PyObject *obj = va_arg(*vargs, PyObject *);
2968 PyObject *str;
2969 assert(obj);
2970 str = PyObject_Str(obj);
2971 if (!str)
2972 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002973 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002974 Py_DECREF(str);
2975 return NULL;
2976 }
2977 Py_DECREF(str);
2978 break;
2979 }
2980
2981 case 'R':
2982 {
2983 PyObject *obj = va_arg(*vargs, PyObject *);
2984 PyObject *repr;
2985 assert(obj);
2986 repr = PyObject_Repr(obj);
2987 if (!repr)
2988 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002989 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002990 Py_DECREF(repr);
2991 return NULL;
2992 }
2993 Py_DECREF(repr);
2994 break;
2995 }
2996
2997 case 'A':
2998 {
2999 PyObject *obj = va_arg(*vargs, PyObject *);
3000 PyObject *ascii;
3001 assert(obj);
3002 ascii = PyObject_ASCII(obj);
3003 if (!ascii)
3004 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003005 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003006 Py_DECREF(ascii);
3007 return NULL;
3008 }
3009 Py_DECREF(ascii);
3010 break;
3011 }
3012
3013 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003014 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003015 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003016 break;
3017
3018 default:
3019 /* if we stumble upon an unknown formatting code, copy the rest
3020 of the format string to the output string. (we cannot just
3021 skip the code, since there's no way to know what's in the
3022 argument list) */
3023 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003024 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003025 return NULL;
3026 f = p+len;
3027 return f;
3028 }
3029
3030 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003031 return f;
3032}
3033
Walter Dörwaldd2034312007-05-18 16:29:38 +00003034PyObject *
3035PyUnicode_FromFormatV(const char *format, va_list vargs)
3036{
Victor Stinnere215d962012-10-06 23:03:36 +02003037 va_list vargs2;
3038 const char *f;
3039 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003040
Victor Stinner8f674cc2013-04-17 23:02:17 +02003041 _PyUnicodeWriter_Init(&writer);
3042 writer.min_length = strlen(format) + 100;
3043 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003044
Benjamin Peterson0c212142016-09-20 20:39:33 -07003045 // Copy varags to be able to pass a reference to a subfunction.
3046 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003047
3048 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003049 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003050 f = unicode_fromformat_arg(&writer, f, &vargs2);
3051 if (f == NULL)
3052 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003054 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003055 const char *p;
3056 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003057
Victor Stinnere215d962012-10-06 23:03:36 +02003058 p = f;
3059 do
3060 {
3061 if ((unsigned char)*p > 127) {
3062 PyErr_Format(PyExc_ValueError,
3063 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3064 "string, got a non-ASCII byte: 0x%02x",
3065 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003066 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003067 }
3068 p++;
3069 }
3070 while (*p != '\0' && *p != '%');
3071 len = p - f;
3072
3073 if (*p == '\0')
3074 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003075
3076 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003077 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003078
3079 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003081 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003082 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003083 return _PyUnicodeWriter_Finish(&writer);
3084
3085 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003086 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003087 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003088 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003089}
3090
Walter Dörwaldd2034312007-05-18 16:29:38 +00003091PyObject *
3092PyUnicode_FromFormat(const char *format, ...)
3093{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003094 PyObject* ret;
3095 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003096
3097#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003098 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003099#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003100 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003101#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003102 ret = PyUnicode_FromFormatV(format, vargs);
3103 va_end(vargs);
3104 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003105}
3106
Serhiy Storchakac46db922018-10-23 22:58:24 +03003107static Py_ssize_t
3108unicode_get_widechar_size(PyObject *unicode)
3109{
3110 Py_ssize_t res;
3111
3112 assert(unicode != NULL);
3113 assert(_PyUnicode_CHECK(unicode));
3114
3115 if (_PyUnicode_WSTR(unicode) != NULL) {
3116 return PyUnicode_WSTR_LENGTH(unicode);
3117 }
3118 assert(PyUnicode_IS_READY(unicode));
3119
3120 res = _PyUnicode_LENGTH(unicode);
3121#if SIZEOF_WCHAR_T == 2
3122 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3123 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3124 const Py_UCS4 *end = s + res;
3125 for (; s < end; ++s) {
3126 if (*s > 0xFFFF) {
3127 ++res;
3128 }
3129 }
3130 }
3131#endif
3132 return res;
3133}
3134
3135static void
3136unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3137{
3138 const wchar_t *wstr;
3139
3140 assert(unicode != NULL);
3141 assert(_PyUnicode_CHECK(unicode));
3142
3143 wstr = _PyUnicode_WSTR(unicode);
3144 if (wstr != NULL) {
3145 memcpy(w, wstr, size * sizeof(wchar_t));
3146 return;
3147 }
3148 assert(PyUnicode_IS_READY(unicode));
3149
3150 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3151 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3152 for (; size--; ++s, ++w) {
3153 *w = *s;
3154 }
3155 }
3156 else {
3157#if SIZEOF_WCHAR_T == 4
3158 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3159 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3160 for (; size--; ++s, ++w) {
3161 *w = *s;
3162 }
3163#else
3164 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3165 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3166 for (; size--; ++s, ++w) {
3167 Py_UCS4 ch = *s;
3168 if (ch > 0xFFFF) {
3169 assert(ch <= MAX_UNICODE);
3170 /* encode surrogate pair in this case */
3171 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3172 if (!size--)
3173 break;
3174 *w = Py_UNICODE_LOW_SURROGATE(ch);
3175 }
3176 else {
3177 *w = ch;
3178 }
3179 }
3180#endif
3181 }
3182}
3183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003184#ifdef HAVE_WCHAR_H
3185
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003186/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003187
Victor Stinnerd88d9832011-09-06 02:00:05 +02003188 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003189 character) required to convert the unicode object. Ignore size argument.
3190
Victor Stinnerd88d9832011-09-06 02:00:05 +02003191 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003192 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003193 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003194Py_ssize_t
3195PyUnicode_AsWideChar(PyObject *unicode,
3196 wchar_t *w,
3197 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003198{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003199 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003200
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003201 if (unicode == NULL) {
3202 PyErr_BadInternalCall();
3203 return -1;
3204 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003205 if (!PyUnicode_Check(unicode)) {
3206 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003207 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003208 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003209
3210 res = unicode_get_widechar_size(unicode);
3211 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003212 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003213 }
3214
3215 if (size > res) {
3216 size = res + 1;
3217 }
3218 else {
3219 res = size;
3220 }
3221 unicode_copy_as_widechar(unicode, w, size);
3222 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003223}
3224
Victor Stinner137c34c2010-09-29 10:25:54 +00003225wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003226PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003227 Py_ssize_t *size)
3228{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003229 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003230 Py_ssize_t buflen;
3231
3232 if (unicode == NULL) {
3233 PyErr_BadInternalCall();
3234 return NULL;
3235 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003236 if (!PyUnicode_Check(unicode)) {
3237 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003238 return NULL;
3239 }
3240
Serhiy Storchakac46db922018-10-23 22:58:24 +03003241 buflen = unicode_get_widechar_size(unicode);
3242 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003243 if (buffer == NULL) {
3244 PyErr_NoMemory();
3245 return NULL;
3246 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003247 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3248 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003249 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003250 }
3251 else if (wcslen(buffer) != (size_t)buflen) {
3252 PyMem_FREE(buffer);
3253 PyErr_SetString(PyExc_ValueError,
3254 "embedded null character");
3255 return NULL;
3256 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003257 return buffer;
3258}
3259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003260#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261
Alexander Belopolsky40018472011-02-26 01:02:56 +00003262PyObject *
3263PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003264{
Victor Stinner8faf8212011-12-08 22:14:11 +01003265 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003266 PyErr_SetString(PyExc_ValueError,
3267 "chr() arg not in range(0x110000)");
3268 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003269 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003270
Victor Stinner985a82a2014-01-03 12:53:47 +01003271 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003272}
3273
Alexander Belopolsky40018472011-02-26 01:02:56 +00003274PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003275PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003277 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003278 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003279 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003280 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003281 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003282 Py_INCREF(obj);
3283 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003284 }
3285 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003286 /* For a Unicode subtype that's not a Unicode object,
3287 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003288 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003289 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003290 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003291 "Can't convert '%.100s' object to str implicitly",
3292 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003293 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003294}
3295
Alexander Belopolsky40018472011-02-26 01:02:56 +00003296PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003297PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003298 const char *encoding,
3299 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003300{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003301 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003302 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003303
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003305 PyErr_BadInternalCall();
3306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003308
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003309 /* Decoding bytes objects is the most common case and should be fast */
3310 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003311 if (PyBytes_GET_SIZE(obj) == 0) {
3312 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3313 return NULL;
3314 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003315 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003316 }
3317 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003318 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3319 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003320 }
3321
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003322 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003323 PyErr_SetString(PyExc_TypeError,
3324 "decoding str is not supported");
3325 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003326 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003327
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003328 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3329 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3330 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003331 "decoding to str: need a bytes-like object, %.80s found",
3332 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003333 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003334 }
Tim Petersced69f82003-09-16 20:30:58 +00003335
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003336 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003337 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003338 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3339 return NULL;
3340 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003341 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003343
Serhiy Storchaka05997252013-01-26 12:14:02 +02003344 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003345 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003346 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347}
3348
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) to `lower`; every
   run of other characters between two copied characters is replaced by a
   single '_', and such runs at the start or at the end of the name are
   dropped.  For example "UTF-8" becomes "utf_8".

   `lower` is a buffer of `lower_len` bytes, terminating null included.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* Without this check lower_len - 1 would wrap around and the
       terminating null would be written outside of the buffer. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the final '\0' */
    punct = 0;                       /* pending separator run? */
    for (e = encoding; *e != '\0'; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        /* Emit one '_' for the pending run, unless nothing has been
           written yet (leading separators are dropped). */
        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3397
Alexander Belopolsky40018472011-02-26 01:02:56 +00003398PyObject *
3399PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003400 Py_ssize_t size,
3401 const char *encoding,
3402 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003403{
3404 PyObject *buffer = NULL, *unicode;
3405 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003406 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3407
Victor Stinner22eb6892019-06-26 00:51:05 +02003408 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3409 return NULL;
3410 }
3411
Victor Stinnered076ed2019-06-26 01:49:32 +02003412 if (size == 0) {
3413 _Py_RETURN_UNICODE_EMPTY();
3414 }
3415
Victor Stinner942889a2016-09-05 15:40:10 -07003416 if (encoding == NULL) {
3417 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3418 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003419
Fred Drakee4315f52000-05-09 19:53:39 +00003420 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003421 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3422 char *lower = buflower;
3423
3424 /* Fast paths */
3425 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3426 lower += 3;
3427 if (*lower == '_') {
3428 /* Match "utf8" and "utf_8" */
3429 lower++;
3430 }
3431
3432 if (lower[0] == '8' && lower[1] == 0) {
3433 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3434 }
3435 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3436 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3437 }
3438 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3439 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3440 }
3441 }
3442 else {
3443 if (strcmp(lower, "ascii") == 0
3444 || strcmp(lower, "us_ascii") == 0) {
3445 return PyUnicode_DecodeASCII(s, size, errors);
3446 }
Steve Dowercc16be82016-09-08 10:35:16 -07003447 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003448 else if (strcmp(lower, "mbcs") == 0) {
3449 return PyUnicode_DecodeMBCS(s, size, errors);
3450 }
3451 #endif
3452 else if (strcmp(lower, "latin1") == 0
3453 || strcmp(lower, "latin_1") == 0
3454 || strcmp(lower, "iso_8859_1") == 0
3455 || strcmp(lower, "iso8859_1") == 0) {
3456 return PyUnicode_DecodeLatin1(s, size, errors);
3457 }
3458 }
Victor Stinner37296e82010-06-10 13:36:23 +00003459 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460
3461 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003462 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003463 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003464 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003465 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 if (buffer == NULL)
3467 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003468 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 if (unicode == NULL)
3470 goto onError;
3471 if (!PyUnicode_Check(unicode)) {
3472 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003473 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003474 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003475 encoding,
3476 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003477 Py_DECREF(unicode);
3478 goto onError;
3479 }
3480 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003481 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003482
Benjamin Peterson29060642009-01-31 22:14:21 +00003483 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 Py_XDECREF(buffer);
3485 return NULL;
3486}
3487
Alexander Belopolsky40018472011-02-26 01:02:56 +00003488PyObject *
3489PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003490 const char *encoding,
3491 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003492{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003493 if (!PyUnicode_Check(unicode)) {
3494 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003495 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003496 }
3497
Serhiy Storchaka00939072016-10-27 21:05:49 +03003498 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3499 "PyUnicode_AsDecodedObject() is deprecated; "
3500 "use PyCodec_Decode() to decode from str", 1) < 0)
3501 return NULL;
3502
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003503 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003505
3506 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003507 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003508}
3509
Alexander Belopolsky40018472011-02-26 01:02:56 +00003510PyObject *
3511PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003512 const char *encoding,
3513 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003514{
3515 PyObject *v;
3516
3517 if (!PyUnicode_Check(unicode)) {
3518 PyErr_BadArgument();
3519 goto onError;
3520 }
3521
Serhiy Storchaka00939072016-10-27 21:05:49 +03003522 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3523 "PyUnicode_AsDecodedUnicode() is deprecated; "
3524 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3525 return NULL;
3526
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003527 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003528 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003529
3530 /* Decode via the codec registry */
3531 v = PyCodec_Decode(unicode, encoding, errors);
3532 if (v == NULL)
3533 goto onError;
3534 if (!PyUnicode_Check(v)) {
3535 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003536 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003537 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003538 encoding,
3539 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003540 Py_DECREF(v);
3541 goto onError;
3542 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003543 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003544
Benjamin Peterson29060642009-01-31 22:14:21 +00003545 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003546 return NULL;
3547}
3548
Alexander Belopolsky40018472011-02-26 01:02:56 +00003549PyObject *
3550PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003551 Py_ssize_t size,
3552 const char *encoding,
3553 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554{
3555 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003556
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003557 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3561 Py_DECREF(unicode);
3562 return v;
3563}
3564
Alexander Belopolsky40018472011-02-26 01:02:56 +00003565PyObject *
3566PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003567 const char *encoding,
3568 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003569{
3570 PyObject *v;
3571
3572 if (!PyUnicode_Check(unicode)) {
3573 PyErr_BadArgument();
3574 goto onError;
3575 }
3576
Serhiy Storchaka00939072016-10-27 21:05:49 +03003577 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3578 "PyUnicode_AsEncodedObject() is deprecated; "
3579 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3580 "or PyCodec_Encode() for generic encoding", 1) < 0)
3581 return NULL;
3582
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003583 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003584 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003585
3586 /* Encode via the codec registry */
3587 v = PyCodec_Encode(unicode, encoding, errors);
3588 if (v == NULL)
3589 goto onError;
3590 return v;
3591
Benjamin Peterson29060642009-01-31 22:14:21 +00003592 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003593 return NULL;
3594}
3595
Victor Stinner1b579672011-12-17 05:47:23 +01003596
Victor Stinner2cba6b82018-01-10 22:46:15 +01003597static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003598unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003599 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003600{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003601 Py_ssize_t wlen;
3602 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3603 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003604 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003605 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003606
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003607 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003608 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003609 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003610 return NULL;
3611 }
3612
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003613 char *str;
3614 size_t error_pos;
3615 const char *reason;
3616 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003617 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003618 PyMem_Free(wstr);
3619
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003620 if (res != 0) {
3621 if (res == -2) {
3622 PyObject *exc;
3623 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3624 "locale", unicode,
3625 (Py_ssize_t)error_pos,
3626 (Py_ssize_t)(error_pos+1),
3627 reason);
3628 if (exc != NULL) {
3629 PyCodec_StrictErrors(exc);
3630 Py_DECREF(exc);
3631 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003632 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003633 else if (res == -3) {
3634 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3635 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003636 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003637 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003638 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003639 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003640 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003641
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003642 PyObject *bytes = PyBytes_FromString(str);
3643 PyMem_RawFree(str);
3644 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003645}
3646
Victor Stinnerad158722010-10-27 00:25:46 +00003647PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003648PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3649{
Victor Stinner709d23d2019-05-02 14:56:30 -04003650 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3651 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003652}
3653
3654PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003655PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003656{
Victor Stinner81a7be32020-04-14 15:14:01 +02003657 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003658 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3659 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003660 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003661 fs_codec->error_handler,
3662 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003663 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003664#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003665 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003666 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003667 fs_codec->encoding,
3668 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003669 }
Victor Stinnerad158722010-10-27 00:25:46 +00003670#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003671 else {
3672 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3673 machinery is not ready and so cannot be used:
3674 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003675 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3676 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003677 assert(filesystem_errors != NULL);
3678 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3679 assert(errors != _Py_ERROR_UNKNOWN);
3680#ifdef _Py_FORCE_UTF8_FS_ENCODING
3681 return unicode_encode_utf8(unicode, errors, NULL);
3682#else
3683 return unicode_encode_locale(unicode, errors, 0);
3684#endif
3685 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003686}
3687
Alexander Belopolsky40018472011-02-26 01:02:56 +00003688PyObject *
3689PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003690 const char *encoding,
3691 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692{
3693 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003694 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003695
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 if (!PyUnicode_Check(unicode)) {
3697 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 }
Fred Drakee4315f52000-05-09 19:53:39 +00003700
Victor Stinner22eb6892019-06-26 00:51:05 +02003701 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3702 return NULL;
3703 }
3704
Victor Stinner942889a2016-09-05 15:40:10 -07003705 if (encoding == NULL) {
3706 return _PyUnicode_AsUTF8String(unicode, errors);
3707 }
3708
Fred Drakee4315f52000-05-09 19:53:39 +00003709 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003710 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3711 char *lower = buflower;
3712
3713 /* Fast paths */
3714 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3715 lower += 3;
3716 if (*lower == '_') {
3717 /* Match "utf8" and "utf_8" */
3718 lower++;
3719 }
3720
3721 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003723 }
3724 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3725 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3726 }
3727 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3728 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3729 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003730 }
Victor Stinner942889a2016-09-05 15:40:10 -07003731 else {
3732 if (strcmp(lower, "ascii") == 0
3733 || strcmp(lower, "us_ascii") == 0) {
3734 return _PyUnicode_AsASCIIString(unicode, errors);
3735 }
Steve Dowercc16be82016-09-08 10:35:16 -07003736#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003737 else if (strcmp(lower, "mbcs") == 0) {
3738 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3739 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003740#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003741 else if (strcmp(lower, "latin1") == 0 ||
3742 strcmp(lower, "latin_1") == 0 ||
3743 strcmp(lower, "iso_8859_1") == 0 ||
3744 strcmp(lower, "iso8859_1") == 0) {
3745 return _PyUnicode_AsLatin1String(unicode, errors);
3746 }
3747 }
Victor Stinner37296e82010-06-10 13:36:23 +00003748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749
3750 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003751 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003753 return NULL;
3754
3755 /* The normal path */
3756 if (PyBytes_Check(v))
3757 return v;
3758
3759 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003760 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003761 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003762 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003763
3764 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003765 "encoder %s returned bytearray instead of bytes; "
3766 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003767 encoding);
3768 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003769 Py_DECREF(v);
3770 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003771 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003772
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003773 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3774 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003775 Py_DECREF(v);
3776 return b;
3777 }
3778
3779 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003780 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003781 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003782 encoding,
3783 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003784 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003785 return NULL;
3786}
3787
Alexander Belopolsky40018472011-02-26 01:02:56 +00003788PyObject *
3789PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003790 const char *encoding,
3791 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003792{
3793 PyObject *v;
3794
3795 if (!PyUnicode_Check(unicode)) {
3796 PyErr_BadArgument();
3797 goto onError;
3798 }
3799
Serhiy Storchaka00939072016-10-27 21:05:49 +03003800 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3801 "PyUnicode_AsEncodedUnicode() is deprecated; "
3802 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3803 return NULL;
3804
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003805 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003806 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003807
3808 /* Encode via the codec registry */
3809 v = PyCodec_Encode(unicode, encoding, errors);
3810 if (v == NULL)
3811 goto onError;
3812 if (!PyUnicode_Check(v)) {
3813 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003814 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003815 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003816 encoding,
3817 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003818 Py_DECREF(v);
3819 goto onError;
3820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003822
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 return NULL;
3825}
3826
Victor Stinner2cba6b82018-01-10 22:46:15 +01003827static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003828unicode_decode_locale(const char *str, Py_ssize_t len,
3829 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003830{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003831 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3832 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003833 return NULL;
3834 }
3835
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003836 wchar_t *wstr;
3837 size_t wlen;
3838 const char *reason;
3839 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003840 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003841 if (res != 0) {
3842 if (res == -2) {
3843 PyObject *exc;
3844 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3845 "locale", str, len,
3846 (Py_ssize_t)wlen,
3847 (Py_ssize_t)(wlen + 1),
3848 reason);
3849 if (exc != NULL) {
3850 PyCodec_StrictErrors(exc);
3851 Py_DECREF(exc);
3852 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003853 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003854 else if (res == -3) {
3855 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3856 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003857 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003858 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003859 }
Victor Stinner2f197072011-12-17 07:08:30 +01003860 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003861 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003862
3863 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3864 PyMem_RawFree(wstr);
3865 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003866}
3867
3868PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003869PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3870 const char *errors)
3871{
Victor Stinner709d23d2019-05-02 14:56:30 -04003872 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3873 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003874}
3875
3876PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003877PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003878{
3879 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003880 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3881 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003882}
3883
3884
3885PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003886PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003887 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003888 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3889}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003890
Christian Heimes5894ba72007-11-04 11:43:14 +00003891PyObject*
3892PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3893{
Victor Stinner81a7be32020-04-14 15:14:01 +02003894 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003895 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3896 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003897 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003898 fs_codec->error_handler,
3899 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003900 NULL);
3901 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003902#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003903 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003904 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003905 fs_codec->encoding,
3906 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003907 }
Victor Stinnerad158722010-10-27 00:25:46 +00003908#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003909 else {
3910 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3911 machinery is not ready and so cannot be used:
3912 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003913 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3914 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003915 assert(filesystem_errors != NULL);
3916 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3917 assert(errors != _Py_ERROR_UNKNOWN);
3918#ifdef _Py_FORCE_UTF8_FS_ENCODING
3919 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3920#else
3921 return unicode_decode_locale(s, size, errors, 0);
3922#endif
3923 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003924}
3925
Martin v. Löwis011e8422009-05-05 04:43:17 +00003926
3927int
3928PyUnicode_FSConverter(PyObject* arg, void* addr)
3929{
Brett Cannonec6ce872016-09-06 15:50:29 -07003930 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003931 PyObject *output = NULL;
3932 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003933 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003934 if (arg == NULL) {
3935 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003936 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003937 return 1;
3938 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003939 path = PyOS_FSPath(arg);
3940 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003941 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003942 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003943 if (PyBytes_Check(path)) {
3944 output = path;
3945 }
3946 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3947 output = PyUnicode_EncodeFSDefault(path);
3948 Py_DECREF(path);
3949 if (!output) {
3950 return 0;
3951 }
3952 assert(PyBytes_Check(output));
3953 }
3954
Victor Stinner0ea2a462010-04-30 00:22:08 +00003955 size = PyBytes_GET_SIZE(output);
3956 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003957 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003958 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003959 Py_DECREF(output);
3960 return 0;
3961 }
3962 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003963 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003964}
3965
3966
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003967int
3968PyUnicode_FSDecoder(PyObject* arg, void* addr)
3969{
Brett Cannona5711202016-09-06 19:36:01 -07003970 int is_buffer = 0;
3971 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003972 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003973 if (arg == NULL) {
3974 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003975 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003976 return 1;
3977 }
Brett Cannona5711202016-09-06 19:36:01 -07003978
3979 is_buffer = PyObject_CheckBuffer(arg);
3980 if (!is_buffer) {
3981 path = PyOS_FSPath(arg);
3982 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003983 return 0;
3984 }
Brett Cannona5711202016-09-06 19:36:01 -07003985 }
3986 else {
3987 path = arg;
3988 Py_INCREF(arg);
3989 }
3990
3991 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003992 output = path;
3993 }
3994 else if (PyBytes_Check(path) || is_buffer) {
3995 PyObject *path_bytes = NULL;
3996
3997 if (!PyBytes_Check(path) &&
3998 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003999 "path should be string, bytes, or os.PathLike, not %.200s",
4000 Py_TYPE(arg)->tp_name)) {
4001 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004002 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004003 }
4004 path_bytes = PyBytes_FromObject(path);
4005 Py_DECREF(path);
4006 if (!path_bytes) {
4007 return 0;
4008 }
4009 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4010 PyBytes_GET_SIZE(path_bytes));
4011 Py_DECREF(path_bytes);
4012 if (!output) {
4013 return 0;
4014 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004015 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004016 else {
4017 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004018 "path should be string, bytes, or os.PathLike, not %.200s",
4019 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004020 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004021 return 0;
4022 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004023 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004024 Py_DECREF(output);
4025 return 0;
4026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004028 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004029 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004030 Py_DECREF(output);
4031 return 0;
4032 }
4033 *(PyObject**)addr = output;
4034 return Py_CLEANUP_SUPPORTED;
4035}
4036
4037
Inada Naoki02a4d572020-02-27 13:48:59 +09004038static int unicode_fill_utf8(PyObject *unicode);
4039
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004040const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004042{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004043 if (!PyUnicode_Check(unicode)) {
4044 PyErr_BadArgument();
4045 return NULL;
4046 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004047 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004048 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004050 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004051 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 return NULL;
4053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 }
4055
4056 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004057 *psize = PyUnicode_UTF8_LENGTH(unicode);
4058 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004059}
4060
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004061const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4065}
4066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067Py_UNICODE *
4068PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070 if (!PyUnicode_Check(unicode)) {
4071 PyErr_BadArgument();
4072 return NULL;
4073 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004074 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4075 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004077 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004078 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004079
Serhiy Storchakac46db922018-10-23 22:58:24 +03004080 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4081 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4082 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004085 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4086 if (w == NULL) {
4087 PyErr_NoMemory();
4088 return NULL;
4089 }
4090 unicode_copy_as_widechar(unicode, w, wlen + 1);
4091 _PyUnicode_WSTR(unicode) = w;
4092 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4093 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094 }
4095 }
4096 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004097 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004098 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004099}
4100
Alexander Belopolsky40018472011-02-26 01:02:56 +00004101Py_UNICODE *
4102PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004104 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105}
4106
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004107const Py_UNICODE *
4108_PyUnicode_AsUnicode(PyObject *unicode)
4109{
4110 Py_ssize_t size;
4111 const Py_UNICODE *wstr;
4112
4113 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4114 if (wstr && wcslen(wstr) != (size_t)size) {
4115 PyErr_SetString(PyExc_ValueError, "embedded null character");
4116 return NULL;
4117 }
4118 return wstr;
4119}
4120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004121
Alexander Belopolsky40018472011-02-26 01:02:56 +00004122Py_ssize_t
4123PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124{
4125 if (!PyUnicode_Check(unicode)) {
4126 PyErr_BadArgument();
4127 goto onError;
4128 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004129 if (_PyUnicode_WSTR(unicode) == NULL) {
4130 if (PyUnicode_AsUnicode(unicode) == NULL)
4131 goto onError;
4132 }
4133 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 return -1;
4137}
4138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139Py_ssize_t
4140PyUnicode_GetLength(PyObject *unicode)
4141{
Victor Stinner07621332012-06-16 04:53:46 +02004142 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004143 PyErr_BadArgument();
4144 return -1;
4145 }
Victor Stinner07621332012-06-16 04:53:46 +02004146 if (PyUnicode_READY(unicode) == -1)
4147 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004148 return PyUnicode_GET_LENGTH(unicode);
4149}
4150
4151Py_UCS4
4152PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4153{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004154 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004155 int kind;
4156
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004157 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004158 PyErr_BadArgument();
4159 return (Py_UCS4)-1;
4160 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004161 if (PyUnicode_READY(unicode) == -1) {
4162 return (Py_UCS4)-1;
4163 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004164 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004165 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166 return (Py_UCS4)-1;
4167 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004168 data = PyUnicode_DATA(unicode);
4169 kind = PyUnicode_KIND(unicode);
4170 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004171}
4172
4173int
4174PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4175{
4176 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004177 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 return -1;
4179 }
Victor Stinner488fa492011-12-12 00:01:39 +01004180 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004181 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004182 PyErr_SetString(PyExc_IndexError, "string index out of range");
4183 return -1;
4184 }
Victor Stinner488fa492011-12-12 00:01:39 +01004185 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004186 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004187 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4188 PyErr_SetString(PyExc_ValueError, "character out of range");
4189 return -1;
4190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004191 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4192 index, ch);
4193 return 0;
4194}
4195
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage; callers must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4201
Victor Stinner554f3f02010-06-16 23:33:54 +00004202/* create or adjust a UnicodeDecodeError */
4203static void
4204make_decode_exception(PyObject **exceptionObject,
4205 const char *encoding,
4206 const char *input, Py_ssize_t length,
4207 Py_ssize_t startpos, Py_ssize_t endpos,
4208 const char *reason)
4209{
4210 if (*exceptionObject == NULL) {
4211 *exceptionObject = PyUnicodeDecodeError_Create(
4212 encoding, input, length, startpos, endpos, reason);
4213 }
4214 else {
4215 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4216 goto onError;
4217 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4218 goto onError;
4219 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4220 goto onError;
4221 }
4222 return;
4223
4224onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004225 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004226}
4227
Steve Dowercc16be82016-09-08 10:35:16 -07004228#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004229static int
4230widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4231{
4232 if (newsize > *size) {
4233 wchar_t *newbuf = *buf;
4234 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4235 PyErr_NoMemory();
4236 return -1;
4237 }
4238 *buf = newbuf;
4239 }
4240 *size = newsize;
4241 return 0;
4242}
4243
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244/* error handling callback helper:
4245 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004246 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 and adjust various state variables.
4248 return 0 on success, -1 on error
4249*/
4250
Alexander Belopolsky40018472011-02-26 01:02:56 +00004251static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004252unicode_decode_call_errorhandler_wchar(
4253 const char *errors, PyObject **errorHandler,
4254 const char *encoding, const char *reason,
4255 const char **input, const char **inend, Py_ssize_t *startinpos,
4256 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004257 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004259 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260
4261 PyObject *restuple = NULL;
4262 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004263 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004264 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004265 Py_ssize_t requiredsize;
4266 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004267 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004268 wchar_t *repwstr;
4269 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270
4271 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004272 *errorHandler = PyCodec_LookupError(errors);
4273 if (*errorHandler == NULL)
4274 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 }
4276
Victor Stinner554f3f02010-06-16 23:33:54 +00004277 make_decode_exception(exceptionObject,
4278 encoding,
4279 *input, *inend - *input,
4280 *startinpos, *endinpos,
4281 reason);
4282 if (*exceptionObject == NULL)
4283 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284
Petr Viktorinffd97532020-02-11 17:46:57 +01004285 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004289 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004292 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004294
4295 /* Copy back the bytes variables, which might have been modified by the
4296 callback */
4297 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4298 if (!inputobj)
4299 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004300 *input = PyBytes_AS_STRING(inputobj);
4301 insize = PyBytes_GET_SIZE(inputobj);
4302 *inend = *input + insize;
4303 /* we can DECREF safely, as the exception has another reference,
4304 so the object won't go away. */
4305 Py_DECREF(inputobj);
4306
4307 if (newpos<0)
4308 newpos = insize+newpos;
4309 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004310 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004311 goto onError;
4312 }
4313
4314 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4315 if (repwstr == NULL)
4316 goto onError;
4317 /* need more space? (at least enough for what we
4318 have+the replacement+the rest of the string (starting
4319 at the new input position), so we won't have to check space
4320 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004321 requiredsize = *outpos;
4322 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4323 goto overflow;
4324 requiredsize += repwlen;
4325 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4326 goto overflow;
4327 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004328 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004329 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004330 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004331 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004332 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004333 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004334 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004335 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004336 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004338 *endinpos = newpos;
4339 *inptr = *input + newpos;
4340
4341 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004342 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004343 return 0;
4344
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004345 overflow:
4346 PyErr_SetString(PyExc_OverflowError,
4347 "decoded result is too long for a Python string");
4348
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004349 onError:
4350 Py_XDECREF(restuple);
4351 return -1;
4352}
Steve Dowercc16be82016-09-08 10:35:16 -07004353#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354
4355static int
4356unicode_decode_call_errorhandler_writer(
4357 const char *errors, PyObject **errorHandler,
4358 const char *encoding, const char *reason,
4359 const char **input, const char **inend, Py_ssize_t *startinpos,
4360 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4361 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4362{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004363 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004364
4365 PyObject *restuple = NULL;
4366 PyObject *repunicode = NULL;
4367 Py_ssize_t insize;
4368 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004369 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004370 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004372 int need_to_grow = 0;
4373 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004374
4375 if (*errorHandler == NULL) {
4376 *errorHandler = PyCodec_LookupError(errors);
4377 if (*errorHandler == NULL)
4378 goto onError;
4379 }
4380
4381 make_decode_exception(exceptionObject,
4382 encoding,
4383 *input, *inend - *input,
4384 *startinpos, *endinpos,
4385 reason);
4386 if (*exceptionObject == NULL)
4387 goto onError;
4388
Petr Viktorinffd97532020-02-11 17:46:57 +01004389 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004390 if (restuple == NULL)
4391 goto onError;
4392 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004393 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004394 goto onError;
4395 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004396 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004397 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004398
4399 /* Copy back the bytes variables, which might have been modified by the
4400 callback */
4401 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4402 if (!inputobj)
4403 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004404 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004405 *input = PyBytes_AS_STRING(inputobj);
4406 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004407 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004408 /* we can DECREF safely, as the exception has another reference,
4409 so the object won't go away. */
4410 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004414 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004415 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004416 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004417 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418
Victor Stinner170ca6f2013-04-18 00:25:28 +02004419 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004420 if (replen > 1) {
4421 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004422 need_to_grow = 1;
4423 }
4424 new_inptr = *input + newpos;
4425 if (*inend - new_inptr > remain) {
4426 /* We don't know the decoding algorithm here so we make the worst
4427 assumption that one byte decodes to one unicode character.
4428 If unfortunately one byte could decode to more unicode characters,
4429 the decoder may write out-of-bound then. Is it possible for the
4430 algorithms using this function? */
4431 writer->min_length += *inend - new_inptr - remain;
4432 need_to_grow = 1;
4433 }
4434 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004435 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004436 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004437 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4438 goto onError;
4439 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004440 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004441 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004442
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004444 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004447 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004452 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453}
4454
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455/* --- UTF-7 Codec -------------------------------------------------------- */
4456
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457/* See RFC2152 for details. We encode conservatively and decode liberally. */
4458
/* Three small base-64 helper macros, plus DECODE_DIRECT.
   Each macro evaluates its argument more than once, so the argument
   must not have side effects. */

/* True if c is one of the 64 base-64 alphabet characters. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z'))

/* Base-64 value (0..63) of c; only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                      \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* Base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, met outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which starts a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
4489
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must be free of side effects.
 * directO and directWS are parenthesized in the expansion so that any
 * expression (e.g. a conditional expression) can be passed safely.
 * The range test on c comes first, so the table is never indexed with
 * a value outside 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535
Alexander Belopolsky40018472011-02-26 01:02:56 +00004536PyObject *
4537PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004538 Py_ssize_t size,
4539 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004541 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4542}
4543
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544/* The decoder. The only state we preserve is our read position,
4545 * i.e. how many characters we have consumed. So if we end in the
4546 * middle of a shift sequence we have to back off the read position
4547 * and the output to the beginning of the sequence, otherwise we lose
4548 * all the shift state (seen bits, number of bits seen, high
4549 * surrogate). */
4550
Alexander Belopolsky40018472011-02-26 01:02:56 +00004551PyObject *
4552PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004553 Py_ssize_t size,
4554 const char *errors,
4555 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004556{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004558 Py_ssize_t startinpos;
4559 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004561 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004562 const char *errmsg = "";
4563 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004564 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 unsigned int base64bits = 0;
4566 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004567 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 PyObject *errorHandler = NULL;
4569 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004570
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004571 if (size == 0) {
4572 if (consumed)
4573 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004574 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004575 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004577 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004578 _PyUnicodeWriter_Init(&writer);
4579 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004580
4581 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582 e = s + size;
4583
4584 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004585 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004587 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588
Antoine Pitrou244651a2009-05-04 18:56:13 +00004589 if (inShift) { /* in a base-64 section */
4590 if (IS_BASE64(ch)) { /* consume a base-64 character */
4591 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4592 base64bits += 6;
4593 s++;
4594 if (base64bits >= 16) {
4595 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004596 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 base64bits -= 16;
4598 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004599 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 if (surrogate) {
4601 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004602 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4603 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004604 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004605 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004607 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 }
4609 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004610 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004611 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004613 }
4614 }
Victor Stinner551ac952011-11-29 22:58:13 +01004615 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 /* first surrogate */
4617 surrogate = outCh;
4618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004620 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004621 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 }
4623 }
4624 }
4625 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 if (base64bits > 0) { /* left-over bits */
4628 if (base64bits >= 6) {
4629 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004630 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004631 errmsg = "partial character in shift sequence";
4632 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 else {
4635 /* Some bits remain; they should be zero */
4636 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004637 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 errmsg = "non-zero padding bits in shift sequence";
4639 goto utf7Error;
4640 }
4641 }
4642 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004643 if (surrogate && DECODE_DIRECT(ch)) {
4644 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4645 goto onError;
4646 }
4647 surrogate = 0;
4648 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 /* '-' is absorbed; other terminating
4650 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004651 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 }
4654 }
4655 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 s++; /* consume '+' */
4658 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004659 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004660 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004661 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004663 else if (s < e && !IS_BASE64(*s)) {
4664 s++;
4665 errmsg = "ill-formed sequence";
4666 goto utf7Error;
4667 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004668 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004670 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004671 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004672 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004673 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004674 }
4675 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004677 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004678 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004679 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004680 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 else {
4682 startinpos = s-starts;
4683 s++;
4684 errmsg = "unexpected special character";
4685 goto utf7Error;
4686 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004690 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004691 errors, &errorHandler,
4692 "utf7", errmsg,
4693 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004694 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004696 }
4697
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 /* end of string */
4699
4700 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4701 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004702 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 if (surrogate ||
4704 (base64bits >= 6) ||
4705 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004707 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 errors, &errorHandler,
4709 "utf7", "unterminated shift sequence",
4710 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004711 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712 goto onError;
4713 if (s < e)
4714 goto restart;
4715 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004716 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004717
4718 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004719 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004720 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004721 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004722 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004723 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004724 writer.kind, writer.data, shiftOutStart);
4725 Py_XDECREF(errorHandler);
4726 Py_XDECREF(exc);
4727 _PyUnicodeWriter_Dealloc(&writer);
4728 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004729 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004730 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004731 }
4732 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004733 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004734 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004735 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 Py_XDECREF(errorHandler);
4738 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004739 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740
Benjamin Peterson29060642009-01-31 22:14:21 +00004741 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 Py_XDECREF(errorHandler);
4743 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004744 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745 return NULL;
4746}
4747
4748
Alexander Belopolsky40018472011-02-26 01:02:56 +00004749PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004750_PyUnicode_EncodeUTF7(PyObject *str,
4751 int base64SetO,
4752 int base64WhiteSpace,
4753 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004755 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004756 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004757 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004758 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004759 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004760 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004761 unsigned int base64bits = 0;
4762 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004763 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004764 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765
Benjamin Petersonbac79492012-01-14 13:34:47 -05004766 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004767 return NULL;
4768 kind = PyUnicode_KIND(str);
4769 data = PyUnicode_DATA(str);
4770 len = PyUnicode_GET_LENGTH(str);
4771
4772 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004775 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004776 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004777 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004778 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004779 if (v == NULL)
4780 return NULL;
4781
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004782 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004783 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004784 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004785
Antoine Pitrou244651a2009-05-04 18:56:13 +00004786 if (inShift) {
4787 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4788 /* shifting out */
4789 if (base64bits) { /* output remaining bits */
4790 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4791 base64buffer = 0;
4792 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004793 }
4794 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004795 /* Characters not in the BASE64 set implicitly unshift the sequence
4796 so no '-' is required, except if the character is itself a '-' */
4797 if (IS_BASE64(ch) || ch == '-') {
4798 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004799 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 *out++ = (char) ch;
4801 }
4802 else {
4803 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004804 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004805 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004806 else { /* not in a shift sequence */
4807 if (ch == '+') {
4808 *out++ = '+';
4809 *out++ = '-';
4810 }
4811 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4812 *out++ = (char) ch;
4813 }
4814 else {
4815 *out++ = '+';
4816 inShift = 1;
4817 goto encode_char;
4818 }
4819 }
4820 continue;
4821encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004823 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004824
Antoine Pitrou244651a2009-05-04 18:56:13 +00004825 /* code first surrogate */
4826 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004827 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004828 while (base64bits >= 6) {
4829 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4830 base64bits -= 6;
4831 }
4832 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004833 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004834 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004835 base64bits += 16;
4836 base64buffer = (base64buffer << 16) | ch;
4837 while (base64bits >= 6) {
4838 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4839 base64bits -= 6;
4840 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004841 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004842 if (base64bits)
4843 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4844 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004845 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004846 if (_PyBytes_Resize(&v, out - start) < 0)
4847 return NULL;
4848 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004849}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004850PyObject *
4851PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4852 Py_ssize_t size,
4853 int base64SetO,
4854 int base64WhiteSpace,
4855 const char *errors)
4856{
4857 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004858 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004859 if (tmp == NULL)
4860 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004861 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004862 base64WhiteSpace, errors);
4863 Py_DECREF(tmp);
4864 return result;
4865}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004866
Antoine Pitrou244651a2009-05-04 18:56:13 +00004867#undef IS_BASE64
4868#undef FROM_BASE64
4869#undef TO_BASE64
4870#undef DECODE_DIRECT
4871#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004872
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873/* --- UTF-8 Codec -------------------------------------------------------- */
4874
Alexander Belopolsky40018472011-02-26 01:02:56 +00004875PyObject *
4876PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004877 Py_ssize_t size,
4878 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879{
Walter Dörwald69652032004-09-07 20:24:22 +00004880 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4881}
4882
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883#include "stringlib/asciilib.h"
4884#include "stringlib/codecs.h"
4885#include "stringlib/undef.h"
4886
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004887#include "stringlib/ucs1lib.h"
4888#include "stringlib/codecs.h"
4889#include "stringlib/undef.h"
4890
4891#include "stringlib/ucs2lib.h"
4892#include "stringlib/codecs.h"
4893#include "stringlib/undef.h"
4894
4895#include "stringlib/ucs4lib.h"
4896#include "stringlib/codecs.h"
4897#include "stringlib/undef.h"
4898
Antoine Pitrouab868312009-01-10 15:40:25 +00004899/* Mask to quickly check whether a C 'long' contains a
4900 non-ASCII, UTF8-encoded char. */
4901#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004902# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004903#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004904# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004905#else
4906# error C 'long' size should be either 4 or 8!
4907#endif
4908
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004909static Py_ssize_t
4910ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004911{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004912 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004913 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004914
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004915 /*
4916 * Issue #17237: m68k is a bit different from most architectures in
4917 * that objects do not use "natural alignment" - for example, int and
4918 * long are only aligned at 2-byte boundaries. Therefore the assert()
4919 * won't work; also, tests have shown that skipping the "optimised
4920 * version" will even speed up m68k.
4921 */
4922#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004924 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4925 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 /* Fast path, see in STRINGLIB(utf8_decode) for
4927 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004928 /* Help allocation */
4929 const char *_p = p;
4930 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004931 while (_p < aligned_end) {
4932 unsigned long value = *(const unsigned long *) _p;
4933 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004935 *((unsigned long *)q) = value;
4936 _p += SIZEOF_LONG;
4937 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004938 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004939 p = _p;
4940 while (p < end) {
4941 if ((unsigned char)*p & 0x80)
4942 break;
4943 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004948#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949 while (p < end) {
4950 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4951 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004952 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004953 /* Help allocation */
4954 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004956 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004957 if (value & ASCII_CHAR_MASK)
4958 break;
4959 _p += SIZEOF_LONG;
4960 }
4961 p = _p;
4962 if (_p == end)
4963 break;
4964 }
4965 if ((unsigned char)*p & 0x80)
4966 break;
4967 ++p;
4968 }
4969 memcpy(dest, start, p - start);
4970 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971}
Antoine Pitrouab868312009-01-10 15:40:25 +00004972
Victor Stinner709d23d2019-05-02 14:56:30 -04004973static PyObject *
4974unicode_decode_utf8(const char *s, Py_ssize_t size,
4975 _Py_error_handler error_handler, const char *errors,
4976 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004977{
Victor Stinner785938e2011-12-11 20:09:03 +01004978 if (size == 0) {
4979 if (consumed)
4980 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004981 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004982 }
4983
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4985 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004986 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 *consumed = 1;
4988 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004989 }
4990
Inada Naoki770847a2019-06-24 12:30:24 +09004991 const char *starts = s;
4992 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004993
Inada Naoki770847a2019-06-24 12:30:24 +09004994 // fast path: try ASCII string.
4995 PyObject *u = PyUnicode_New(size, 127);
4996 if (u == NULL) {
4997 return NULL;
4998 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004999 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005000 if (s == end) {
5001 return u;
5002 }
5003
5004 // Use _PyUnicodeWriter after fast path is failed.
5005 _PyUnicodeWriter writer;
5006 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5007 writer.pos = s - starts;
5008
5009 Py_ssize_t startinpos, endinpos;
5010 const char *errmsg = "";
5011 PyObject *error_handler_obj = NULL;
5012 PyObject *exc = NULL;
5013
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005014 while (s < end) {
5015 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005016 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005017
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005018 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005019 if (PyUnicode_IS_ASCII(writer.buffer))
5020 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005022 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005023 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005024 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 } else {
5026 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005027 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028 }
5029
5030 switch (ch) {
5031 case 0:
5032 if (s == end || consumed)
5033 goto End;
5034 errmsg = "unexpected end of data";
5035 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005036 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005037 break;
5038 case 1:
5039 errmsg = "invalid start byte";
5040 startinpos = s - starts;
5041 endinpos = startinpos + 1;
5042 break;
5043 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005044 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5045 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5046 {
5047 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005048 goto End;
5049 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005050 /* fall through */
5051 case 3:
5052 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005053 errmsg = "invalid continuation byte";
5054 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005055 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005056 break;
5057 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005058 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059 goto onError;
5060 continue;
5061 }
5062
Victor Stinner1d65d912015-10-05 13:43:50 +02005063 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005064 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005065
5066 switch (error_handler) {
5067 case _Py_ERROR_IGNORE:
5068 s += (endinpos - startinpos);
5069 break;
5070
5071 case _Py_ERROR_REPLACE:
5072 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5073 goto onError;
5074 s += (endinpos - startinpos);
5075 break;
5076
5077 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005078 {
5079 Py_ssize_t i;
5080
Victor Stinner1d65d912015-10-05 13:43:50 +02005081 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5082 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005083 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005084 ch = (Py_UCS4)(unsigned char)(starts[i]);
5085 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5086 ch + 0xdc00);
5087 writer.pos++;
5088 }
5089 s += (endinpos - startinpos);
5090 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005091 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005092
5093 default:
5094 if (unicode_decode_call_errorhandler_writer(
5095 errors, &error_handler_obj,
5096 "utf-8", errmsg,
5097 &starts, &end, &startinpos, &endinpos, &exc, &s,
5098 &writer))
5099 goto onError;
5100 }
Victor Stinner785938e2011-12-11 20:09:03 +01005101 }
5102
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005103End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005104 if (consumed)
5105 *consumed = s - starts;
5106
Victor Stinner1d65d912015-10-05 13:43:50 +02005107 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005108 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005109 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005110
5111onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005112 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005113 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005114 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005115 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005116}
5117
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005118
Victor Stinner709d23d2019-05-02 14:56:30 -04005119PyObject *
5120PyUnicode_DecodeUTF8Stateful(const char *s,
5121 Py_ssize_t size,
5122 const char *errors,
5123 Py_ssize_t *consumed)
5124{
5125 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5126}
5127
5128
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005129/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5130 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005131
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005132 On success, write a pointer to a newly allocated wide character string into
5133 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5134 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005135
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005136 On memory allocation failure, return -1.
5137
5138 On decoding error (if surrogateescape is zero), return -2. If wlen is
5139 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5140 is not NULL, write the decoding error message into *reason. */
5141int
5142_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005143 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005145 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 wchar_t *unicode;
5148 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005149
Victor Stinner3d4226a2018-08-29 22:21:32 +02005150 int surrogateescape = 0;
5151 int surrogatepass = 0;
5152 switch (errors)
5153 {
5154 case _Py_ERROR_STRICT:
5155 break;
5156 case _Py_ERROR_SURROGATEESCAPE:
5157 surrogateescape = 1;
5158 break;
5159 case _Py_ERROR_SURROGATEPASS:
5160 surrogatepass = 1;
5161 break;
5162 default:
5163 return -3;
5164 }
5165
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005166 /* Note: size will always be longer than the resulting Unicode
5167 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005168 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005169 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005170 }
5171
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005172 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005173 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005174 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005175 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005176
5177 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005178 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005179 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005180 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005181 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005182#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005183 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005184#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005185 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005186#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005187 if (ch > 0xFF) {
5188#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005189 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005190#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005191 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005192 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005193 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5194 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5195#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005196 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005197 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005198 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005199 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005200 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005201
5202 if (surrogateescape) {
5203 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5204 }
5205 else {
5206 /* Is it a valid three-byte code? */
5207 if (surrogatepass
5208 && (e - s) >= 3
5209 && (s[0] & 0xf0) == 0xe0
5210 && (s[1] & 0xc0) == 0x80
5211 && (s[2] & 0xc0) == 0x80)
5212 {
5213 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5214 s += 3;
5215 unicode[outpos++] = ch;
5216 }
5217 else {
5218 PyMem_RawFree(unicode );
5219 if (reason != NULL) {
5220 switch (ch) {
5221 case 0:
5222 *reason = "unexpected end of data";
5223 break;
5224 case 1:
5225 *reason = "invalid start byte";
5226 break;
5227 /* 2, 3, 4 */
5228 default:
5229 *reason = "invalid continuation byte";
5230 break;
5231 }
5232 }
5233 if (wlen != NULL) {
5234 *wlen = s - orig_s;
5235 }
5236 return -2;
5237 }
5238 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005239 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005240 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005241 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005242 if (wlen) {
5243 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005244 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005245 *wstr = unicode;
5246 return 0;
5247}
5248
Victor Stinner5f9cf232019-03-19 01:46:25 +01005249
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005250wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005251_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5252 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005253{
5254 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005255 int res = _Py_DecodeUTF8Ex(arg, arglen,
5256 &wstr, wlen,
5257 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005258 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005259 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5260 assert(res != -3);
5261 if (wlen) {
5262 *wlen = (size_t)res;
5263 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005264 return NULL;
5265 }
5266 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005267}
5268
Antoine Pitrouab868312009-01-10 15:40:25 +00005269
Victor Stinnere47e6982017-12-21 15:45:16 +01005270/* UTF-8 encoder using the surrogateescape error handler .
5271
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005272 On success, return 0 and write the newly allocated character string (use
5273 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005274
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005275 On encoding failure, return -2 and write the position of the invalid
5276 surrogate character into *error_pos (if error_pos is set) and the decoding
5277 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005278
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005279 On memory allocation failure, return -1. */
5280int
5281_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005282 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005283{
5284 const Py_ssize_t max_char_size = 4;
5285 Py_ssize_t len = wcslen(text);
5286
5287 assert(len >= 0);
5288
Victor Stinner3d4226a2018-08-29 22:21:32 +02005289 int surrogateescape = 0;
5290 int surrogatepass = 0;
5291 switch (errors)
5292 {
5293 case _Py_ERROR_STRICT:
5294 break;
5295 case _Py_ERROR_SURROGATEESCAPE:
5296 surrogateescape = 1;
5297 break;
5298 case _Py_ERROR_SURROGATEPASS:
5299 surrogatepass = 1;
5300 break;
5301 default:
5302 return -3;
5303 }
5304
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005305 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5306 return -1;
5307 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005308 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005309 if (raw_malloc) {
5310 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005311 }
5312 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005313 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005314 }
5315 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005316 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005317 }
5318
5319 char *p = bytes;
5320 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005321 for (i = 0; i < len; ) {
5322 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005323 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005324 i++;
5325#if Py_UNICODE_SIZE == 2
5326 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5327 && i < len
5328 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5329 {
5330 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5331 i++;
5332 }
5333#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005334
5335 if (ch < 0x80) {
5336 /* Encode ASCII */
5337 *p++ = (char) ch;
5338
5339 }
5340 else if (ch < 0x0800) {
5341 /* Encode Latin-1 */
5342 *p++ = (char)(0xc0 | (ch >> 6));
5343 *p++ = (char)(0x80 | (ch & 0x3f));
5344 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005345 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005346 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005347 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005348 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005349 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005350 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005351 if (reason != NULL) {
5352 *reason = "encoding error";
5353 }
5354 if (raw_malloc) {
5355 PyMem_RawFree(bytes);
5356 }
5357 else {
5358 PyMem_Free(bytes);
5359 }
5360 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005361 }
5362 *p++ = (char)(ch & 0xff);
5363 }
5364 else if (ch < 0x10000) {
5365 *p++ = (char)(0xe0 | (ch >> 12));
5366 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5367 *p++ = (char)(0x80 | (ch & 0x3f));
5368 }
5369 else { /* ch >= 0x10000 */
5370 assert(ch <= MAX_UNICODE);
5371 /* Encode UCS4 Unicode ordinals */
5372 *p++ = (char)(0xf0 | (ch >> 18));
5373 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5374 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5375 *p++ = (char)(0x80 | (ch & 0x3f));
5376 }
5377 }
5378 *p++ = '\0';
5379
5380 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005381 char *bytes2;
5382 if (raw_malloc) {
5383 bytes2 = PyMem_RawRealloc(bytes, final_size);
5384 }
5385 else {
5386 bytes2 = PyMem_Realloc(bytes, final_size);
5387 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005388 if (bytes2 == NULL) {
5389 if (error_pos != NULL) {
5390 *error_pos = (size_t)-1;
5391 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005392 if (raw_malloc) {
5393 PyMem_RawFree(bytes);
5394 }
5395 else {
5396 PyMem_Free(bytes);
5397 }
5398 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005399 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005400 *str = bytes2;
5401 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005402}
5403
5404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005405/* Primary internal function which creates utf8 encoded bytes objects.
5406
5407 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005408 and allocate exactly as much space needed at the end. Else allocate the
5409 maximum possible needed (4 result bytes per Unicode character), and return
5410 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005411*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005412static PyObject *
5413unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5414 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005416 if (!PyUnicode_Check(unicode)) {
5417 PyErr_BadArgument();
5418 return NULL;
5419 }
5420
5421 if (PyUnicode_READY(unicode) == -1)
5422 return NULL;
5423
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005424 if (PyUnicode_UTF8(unicode))
5425 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5426 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005427
Inada Naoki02a4d572020-02-27 13:48:59 +09005428 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005429 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005430 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5431
5432 _PyBytesWriter writer;
5433 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005434
Benjamin Petersonead6b532011-12-20 17:23:42 -06005435 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005436 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005437 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005438 case PyUnicode_1BYTE_KIND:
5439 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5440 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005441 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5442 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005443 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005444 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5445 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005446 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005447 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5448 break;
Tim Peters602f7402002-04-27 18:03:26 +00005449 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005450
5451 if (end == NULL) {
5452 _PyBytesWriter_Dealloc(&writer);
5453 return NULL;
5454 }
5455 return _PyBytesWriter_Finish(&writer, end);
5456}
5457
5458static int
5459unicode_fill_utf8(PyObject *unicode)
5460{
5461 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5462 assert(!PyUnicode_IS_ASCII(unicode));
5463
5464 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005465 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005466 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5467
5468 _PyBytesWriter writer;
5469 char *end;
5470
5471 switch (kind) {
5472 default:
5473 Py_UNREACHABLE();
5474 case PyUnicode_1BYTE_KIND:
5475 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5476 _Py_ERROR_STRICT, NULL);
5477 break;
5478 case PyUnicode_2BYTE_KIND:
5479 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5480 _Py_ERROR_STRICT, NULL);
5481 break;
5482 case PyUnicode_4BYTE_KIND:
5483 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5484 _Py_ERROR_STRICT, NULL);
5485 break;
5486 }
5487 if (end == NULL) {
5488 _PyBytesWriter_Dealloc(&writer);
5489 return -1;
5490 }
5491
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005492 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005493 PyBytes_AS_STRING(writer.buffer);
5494 Py_ssize_t len = end - start;
5495
5496 char *cache = PyObject_MALLOC(len + 1);
5497 if (cache == NULL) {
5498 _PyBytesWriter_Dealloc(&writer);
5499 PyErr_NoMemory();
5500 return -1;
5501 }
5502 _PyUnicode_UTF8(unicode) = cache;
5503 _PyUnicode_UTF8_LENGTH(unicode) = len;
5504 memcpy(cache, start, len);
5505 cache[len] = '\0';
5506 _PyBytesWriter_Dealloc(&writer);
5507 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508}
5509
Alexander Belopolsky40018472011-02-26 01:02:56 +00005510PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005511_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5512{
5513 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5514}
5515
5516
5517PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005518PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5519 Py_ssize_t size,
5520 const char *errors)
5521{
5522 PyObject *v, *unicode;
5523
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005524 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005525 if (unicode == NULL)
5526 return NULL;
5527 v = _PyUnicode_AsUTF8String(unicode, errors);
5528 Py_DECREF(unicode);
5529 return v;
5530}
5531
5532PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005533PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005535 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536}
5537
Walter Dörwald41980ca2007-08-16 21:55:45 +00005538/* --- UTF-32 Codec ------------------------------------------------------- */
5539
5540PyObject *
5541PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 Py_ssize_t size,
5543 const char *errors,
5544 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005545{
5546 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5547}
5548
/* Decode `size` bytes of UTF-32 data at `s` into a new str object.

   s, size    input buffer and its length in bytes.
   errors     error handler name, forwarded unchanged to
              unicode_decode_call_errorhandler_writer().
   byteorder  may be NULL.  If given, *byteorder supplies the initial byte
              order: negative selects little-endian, positive big-endian,
              0 the platform's native order.  When the initial order is 0
              and at least 4 bytes are available, the first 4 bytes are
              checked for a BOM; a recognised BOM is skipped and the
              resulting order (-1, 1, or still 0) is written back to
              *byteorder.  No write-back happens otherwise.
   consumed   may be NULL.  If given, trailing bytes that do not form a full
              4-byte unit are not an error: decoding stops there and
              *consumed receives the number of input bytes processed.  If
              NULL, such trailing bytes are reported as "truncated data".

   Returns the decoded string (the shared empty string when nothing is left
   after the optional BOM), or NULL after releasing the writer when the
   writer or the error handler reports failure. */
5549PyObject *
5550PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 Py_ssize_t size,
5552 const char *errors,
5553 int *byteorder,
5554 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005555{
5556 const char *starts = s;
5557 Py_ssize_t startinpos;
5558 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005559 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005560 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005561 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005562 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005563 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005564 PyObject *errorHandler = NULL;
5565 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005566
Andy Lestere6be9b52020-02-11 20:28:35 -06005567 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005568 e = q + size;
5569
5570 if (byteorder)
5571 bo = *byteorder;
5572
5573 /* Check for BOM marks (U+FEFF) in the input and adjust current
5574 byte order setting accordingly. In native mode, the leading BOM
5575 mark is skipped, in all other modes, it is copied to the output
5576 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005577 if (bo == 0 && size >= 4) {
/* The 4 bytes are always assembled little-endian first, so 0x0000FEFF
   means a little-endian BOM and 0xFFFE0000 a big-endian one. */
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005578 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005579 if (bom == 0x0000FEFF) {
5580 bo = -1;
5581 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005583 else if (bom == 0xFFFE0000) {
5584 bo = 1;
5585 q += 4;
5586 }
5587 if (byteorder)
5588 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005589 }
5590
/* Nothing left to decode (empty input, or only a BOM). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005591 if (q == e) {
5592 if (consumed)
5593 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005594 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005595 }
5596
/* Resolve "native" (bo == 0) to a concrete byte order for this build. */
Victor Stinnere64322e2012-10-30 23:12:47 +01005597#ifdef WORDS_BIGENDIAN
5598 le = bo < 0;
5599#else
5600 le = bo <= 0;
5601#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005602 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005603
/* Pre-size the writer for one character per 4 remaining bytes, rounded up. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005604 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005605 writer.min_length = (e - q + 3) / 4;
5606 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005607 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005608
Victor Stinnere64322e2012-10-30 23:12:47 +01005609 while (1) {
5610 Py_UCS4 ch = 0;
/* Re-read on every pass: the writer's buffer may have changed since the
   previous iteration. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005611 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005612
/* Fast path: store consecutive 4-byte units directly into the writer's
   buffer for as long as each value fits maxch and is not a surrogate.
   On an early break q still points at the offending unit and ch holds
   its value; on normal exit fewer than 4 bytes remain. */
Victor Stinnere64322e2012-10-30 23:12:47 +01005613 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005614 enum PyUnicode_Kind kind = writer.kind;
5615 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005616 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005617 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005618 if (le) {
5619 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005620 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005621 if (ch > maxch)
5622 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005623 if (kind != PyUnicode_1BYTE_KIND &&
5624 Py_UNICODE_IS_SURROGATE(ch))
5625 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005626 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005627 q += 4;
5628 } while (q <= last);
5629 }
5630 else {
5631 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005632 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005633 if (ch > maxch)
5634 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005635 if (kind != PyUnicode_1BYTE_KIND &&
5636 Py_UNICODE_IS_SURROGATE(ch))
5637 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005638 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005639 q += 4;
5640 } while (q <= last);
5641 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005642 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005643 }
5644
/* Classify why the fast path stopped (or never ran, leaving ch == 0). */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005645 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005646 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005647 startinpos = ((const char *)q) - starts;
5648 endinpos = startinpos + 4;
5649 }
5650 else if (ch <= maxch) {
/* No offending unit: either all input is used up, or fewer than 4 bytes
   remain.  With `consumed` set the leftover is simply left unread. */
Victor Stinnere64322e2012-10-30 23:12:47 +01005651 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005653 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005654 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005655 startinpos = ((const char *)q) - starts;
5656 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005658 else {
/* Valid code point that exceeds maxch: hand it to the writer instead of
   storing it directly, then resume the fast path.
   NOTE(review): presumably the writer widens its buffer here - confirm. */
5659 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005660 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005661 goto onError;
5662 q += 4;
5663 continue;
5664 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005665 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005666 startinpos = ((const char *)q) - starts;
5667 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005669
5670 /* The remaining input chars are ignored if the callback
5671 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005672 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005674 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005676 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005678 }
5679
Walter Dörwald41980ca2007-08-16 21:55:45 +00005680 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005682
Walter Dörwald41980ca2007-08-16 21:55:45 +00005683 Py_XDECREF(errorHandler);
5684 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005685 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005686
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005689 Py_XDECREF(errorHandler);
5690 Py_XDECREF(exc);
5691 return NULL;
5692}
5693
5694PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005695_PyUnicode_EncodeUTF32(PyObject *str,
5696 const char *errors,
5697 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005698{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005699 enum PyUnicode_Kind kind;
5700 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005701 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005702 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005703 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005704#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005705 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005706#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005707 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005708#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005709 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005710 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005711 PyObject *errorHandler = NULL;
5712 PyObject *exc = NULL;
5713 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005714
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005715 if (!PyUnicode_Check(str)) {
5716 PyErr_BadArgument();
5717 return NULL;
5718 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005719 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005720 return NULL;
5721 kind = PyUnicode_KIND(str);
5722 data = PyUnicode_DATA(str);
5723 len = PyUnicode_GET_LENGTH(str);
5724
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005725 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005726 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005727 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005728 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005729 if (v == NULL)
5730 return NULL;
5731
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005732 /* output buffer is 4-bytes aligned */
5733 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005734 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005735 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005736 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005737 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005738 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005739
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005740 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005741 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005742 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005743 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005744 else
5745 encoding = "utf-32";
5746
5747 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005748 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5749 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005750 }
5751
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005752 pos = 0;
5753 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005754 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005755
5756 if (kind == PyUnicode_2BYTE_KIND) {
5757 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5758 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005759 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005760 else {
5761 assert(kind == PyUnicode_4BYTE_KIND);
5762 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5763 &out, native_ordering);
5764 }
5765 if (pos == len)
5766 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005767
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768 rep = unicode_encode_call_errorhandler(
5769 errors, &errorHandler,
5770 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005771 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005772 if (!rep)
5773 goto error;
5774
5775 if (PyBytes_Check(rep)) {
5776 repsize = PyBytes_GET_SIZE(rep);
5777 if (repsize & 3) {
5778 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005779 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005780 "surrogates not allowed");
5781 goto error;
5782 }
5783 moreunits = repsize / 4;
5784 }
5785 else {
5786 assert(PyUnicode_Check(rep));
5787 if (PyUnicode_READY(rep) < 0)
5788 goto error;
5789 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5790 if (!PyUnicode_IS_ASCII(rep)) {
5791 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005792 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005793 "surrogates not allowed");
5794 goto error;
5795 }
5796 }
5797
5798 /* four bytes are reserved for each surrogate */
5799 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005800 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005801 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005802 /* integer overflow */
5803 PyErr_NoMemory();
5804 goto error;
5805 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005806 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005807 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005808 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005809 }
5810
5811 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005812 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005813 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005814 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005815 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005816 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5817 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005818 }
5819
5820 Py_CLEAR(rep);
5821 }
5822
5823 /* Cut back to size actually needed. This is necessary for, for example,
5824 encoding of a string containing isolated surrogates and the 'ignore'
5825 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005826 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005827 if (nsize != PyBytes_GET_SIZE(v))
5828 _PyBytes_Resize(&v, nsize);
5829 Py_XDECREF(errorHandler);
5830 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005831 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005832 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005833 error:
5834 Py_XDECREF(rep);
5835 Py_XDECREF(errorHandler);
5836 Py_XDECREF(exc);
5837 Py_XDECREF(v);
5838 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005839}
5840
Alexander Belopolsky40018472011-02-26 01:02:56 +00005841PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005842PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5843 Py_ssize_t size,
5844 const char *errors,
5845 int byteorder)
5846{
5847 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005848 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005849 if (tmp == NULL)
5850 return NULL;
5851 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5852 Py_DECREF(tmp);
5853 return result;
5854}
5855
5856PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005858{
Victor Stinnerb960b342011-11-20 19:12:52 +01005859 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005860}
5861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862/* --- UTF-16 Codec ------------------------------------------------------- */
5863
Tim Peters772747b2001-08-09 22:21:55 +00005864PyObject *
5865PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 Py_ssize_t size,
5867 const char *errors,
5868 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869{
Walter Dörwald69652032004-09-07 20:24:22 +00005870 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5871}
5872
/* Decode `size` bytes of UTF-16 data at `s` into a new str object.

   s, size    input buffer and its length in bytes.
   errors     error handler name, forwarded unchanged to
              unicode_decode_call_errorhandler_writer().
   byteorder  may be NULL.  If given, *byteorder supplies the initial byte
              order: negative selects little-endian, positive big-endian,
              0 the platform's native order.  When the initial order is 0
              and at least 2 bytes are available, the first 2 bytes are
              checked for a BOM; a recognised BOM is skipped and the
              resulting order (-1, 1, or still 0) is written back to
              *byteorder.  No write-back happens otherwise.
   consumed   may be NULL.  If given, incomplete data at the end of the
              input (a lone trailing byte, or the condition reported below
              as "unexpected end of data") is not an error: decoding stops
              in front of it and *consumed receives the number of input
              bytes processed.

   Returns the decoded string (the shared empty string when nothing is left
   after the optional BOM), or NULL after releasing the writer when the
   writer or the error handler reports failure. */
5873PyObject *
5874PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 Py_ssize_t size,
5876 const char *errors,
5877 int *byteorder,
5878 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005879{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005881 Py_ssize_t startinpos;
5882 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005883 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005884 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005885 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005886 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005887 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005888 PyObject *errorHandler = NULL;
5889 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005890 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
Andy Lestere6be9b52020-02-11 20:28:35 -06005892 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005893 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894
5895 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005896 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005898 /* Check for BOM marks (U+FEFF) in the input and adjust current
5899 byte order setting accordingly. In native mode, the leading BOM
5900 mark is skipped, in all other modes, it is copied to the output
5901 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005902 if (bo == 0 && size >= 2) {
/* The 2 bytes are always assembled little-endian first, so 0xFEFF means
   a little-endian BOM and 0xFFFE a big-endian one. */
5903 const Py_UCS4 bom = (q[1] << 8) | q[0];
5904 if (bom == 0xFEFF) {
5905 q += 2;
5906 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005908 else if (bom == 0xFFFE) {
5909 q += 2;
5910 bo = 1;
5911 }
5912 if (byteorder)
5913 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915
/* Nothing left to decode (empty input, or only a BOM). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005916 if (q == e) {
5917 if (consumed)
5918 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005919 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005920 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005921
/* native_ordering tells the stringlib decoders whether the input byte
   order equals this build's byte order; encoding is the name reported to
   error handlers. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005922#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005923 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005924 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005925#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005926 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005927 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005928#endif
Tim Peters772747b2001-08-09 22:21:55 +00005929
Antoine Pitrou63065d72012-05-15 23:48:04 +02005930 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005931 character count normally. Error handler will take care of
5932 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005933 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005934 writer.min_length = (e - q + 1) / 2;
5935 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005936 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005937
Antoine Pitrou63065d72012-05-15 23:48:04 +02005938 while (1) {
5939 Py_UCS4 ch = 0;
/* Dispatch to the stringlib decoder matching the writer's current
   character kind.  It is given q and writer.pos by address and returns
   either a status code (0-3, handled by the switch below) or a character
   that is then stored through _PyUnicodeWriter_WriteCharInline().  With
   fewer than 2 bytes left no decoder runs and ch stays 0. */
5940 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005941 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005942 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005943 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005944 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005945 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005946 native_ordering);
5947 else
5948 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005949 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005950 native_ordering);
5951 } else if (kind == PyUnicode_2BYTE_KIND) {
5952 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005953 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005954 native_ordering);
5955 } else {
5956 assert(kind == PyUnicode_4BYTE_KIND);
5957 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005958 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005959 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005960 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005961 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005962
/* Cases 0-3 set up errmsg and the error range and fall out of the switch
   into the error-handler call; the default case stores ch and loops. */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005963 switch (ch)
5964 {
5965 case 0:
5966 /* remaining byte at the end? (size should be even) */
5967 if (q == e || consumed)
5968 goto End;
5969 errmsg = "truncated data";
5970 startinpos = ((const char *)q) - starts;
5971 endinpos = ((const char *)e) - starts;
5972 break;
5973 /* The remaining input chars are ignored if the callback
5974 chooses to skip the input */
5975 case 1:
/* Step q back by 2 bytes so that the error range (and *consumed in
   stateful mode) starts 2 bytes before where the decoder stopped. */
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005976 q -= 2;
5977 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005978 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005979 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005980 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005981 endinpos = ((const char *)e) - starts;
5982 break;
5983 case 2:
5984 errmsg = "illegal encoding";
5985 startinpos = ((const char *)q) - 2 - starts;
5986 endinpos = startinpos + 2;
5987 break;
5988 case 3:
5989 errmsg = "illegal UTF-16 surrogate";
5990 startinpos = ((const char *)q) - 4 - starts;
5991 endinpos = startinpos + 2;
5992 break;
5993 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005994 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005995 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 continue;
5997 }
5998
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005999 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006000 errors,
6001 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006002 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006003 &starts,
6004 (const char **)&e,
6005 &startinpos,
6006 &endinpos,
6007 &exc,
6008 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006009 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
6012
Antoine Pitrou63065d72012-05-15 23:48:04 +02006013End:
Walter Dörwald69652032004-09-07 20:24:22 +00006014 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006016
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017 Py_XDECREF(errorHandler);
6018 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006019 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006022 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 Py_XDECREF(errorHandler);
6024 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 return NULL;
6026}
6027
Tim Peters772747b2001-08-09 22:21:55 +00006028PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006029_PyUnicode_EncodeUTF16(PyObject *str,
6030 const char *errors,
6031 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006033 enum PyUnicode_Kind kind;
6034 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006035 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006036 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006037 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006038 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006039#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006040 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006041#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006042 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006043#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006044 const char *encoding;
6045 Py_ssize_t nsize, pos;
6046 PyObject *errorHandler = NULL;
6047 PyObject *exc = NULL;
6048 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006049
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006050 if (!PyUnicode_Check(str)) {
6051 PyErr_BadArgument();
6052 return NULL;
6053 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006054 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006055 return NULL;
6056 kind = PyUnicode_KIND(str);
6057 data = PyUnicode_DATA(str);
6058 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006059
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006060 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006061 if (kind == PyUnicode_4BYTE_KIND) {
6062 const Py_UCS4 *in = (const Py_UCS4 *)data;
6063 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006064 while (in < end) {
6065 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006066 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006067 }
6068 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006069 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006070 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006072 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006073 nsize = len + pairs + (byteorder == 0);
6074 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006075 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006079 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006080 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006081 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006082 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006083 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006084 }
6085 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006086 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006087 }
Tim Peters772747b2001-08-09 22:21:55 +00006088
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006089 if (kind == PyUnicode_1BYTE_KIND) {
6090 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6091 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006092 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006093
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006094 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006095 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006096 }
6097 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006098 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006099 }
6100 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006101 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006102 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006103
6104 pos = 0;
6105 while (pos < len) {
6106 Py_ssize_t repsize, moreunits;
6107
6108 if (kind == PyUnicode_2BYTE_KIND) {
6109 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6110 &out, native_ordering);
6111 }
6112 else {
6113 assert(kind == PyUnicode_4BYTE_KIND);
6114 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6115 &out, native_ordering);
6116 }
6117 if (pos == len)
6118 break;
6119
6120 rep = unicode_encode_call_errorhandler(
6121 errors, &errorHandler,
6122 encoding, "surrogates not allowed",
6123 str, &exc, pos, pos + 1, &pos);
6124 if (!rep)
6125 goto error;
6126
6127 if (PyBytes_Check(rep)) {
6128 repsize = PyBytes_GET_SIZE(rep);
6129 if (repsize & 1) {
6130 raise_encode_exception(&exc, encoding,
6131 str, pos - 1, pos,
6132 "surrogates not allowed");
6133 goto error;
6134 }
6135 moreunits = repsize / 2;
6136 }
6137 else {
6138 assert(PyUnicode_Check(rep));
6139 if (PyUnicode_READY(rep) < 0)
6140 goto error;
6141 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6142 if (!PyUnicode_IS_ASCII(rep)) {
6143 raise_encode_exception(&exc, encoding,
6144 str, pos - 1, pos,
6145 "surrogates not allowed");
6146 goto error;
6147 }
6148 }
6149
6150 /* two bytes are reserved for each surrogate */
6151 if (moreunits > 1) {
6152 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006153 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006154 /* integer overflow */
6155 PyErr_NoMemory();
6156 goto error;
6157 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006158 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006159 goto error;
6160 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6161 }
6162
6163 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006164 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006165 out += moreunits;
6166 } else /* rep is unicode */ {
6167 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6168 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6169 &out, native_ordering);
6170 }
6171
6172 Py_CLEAR(rep);
6173 }
6174
6175 /* Cut back to size actually needed. This is necessary for, for example,
6176 encoding of a string containing isolated surrogates and the 'ignore' handler
6177 is used. */
6178 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6179 if (nsize != PyBytes_GET_SIZE(v))
6180 _PyBytes_Resize(&v, nsize);
6181 Py_XDECREF(errorHandler);
6182 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006183 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006184 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006185 error:
6186 Py_XDECREF(rep);
6187 Py_XDECREF(errorHandler);
6188 Py_XDECREF(exc);
6189 Py_XDECREF(v);
6190 return NULL;
6191#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192}
6193
Alexander Belopolsky40018472011-02-26 01:02:56 +00006194PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6196 Py_ssize_t size,
6197 const char *errors,
6198 int byteorder)
6199{
6200 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006201 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006202 if (tmp == NULL)
6203 return NULL;
6204 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6205 Py_DECREF(tmp);
6206 return result;
6207}
6208
6209PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006210PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006212 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
6215/* --- Unicode Escape Codec ----------------------------------------------- */
6216
/* Cached pointer to the name-lookup C API imported from the capsule named
   by PyUnicodeData_CAPSULE_NAME.  It starts out NULL and is filled in the
   first time the unicode-escape decoder meets a \N escape. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006218
Alexander Belopolsky40018472011-02-26 01:02:56 +00006219PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006220_PyUnicode_DecodeUnicodeEscape(const char *s,
6221 Py_ssize_t size,
6222 const char *errors,
6223 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006225 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006226 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006228 PyObject *errorHandler = NULL;
6229 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006230
Eric V. Smith42454af2016-10-31 09:22:08 -04006231 // so we can remember if we've seen an invalid escape char or not
6232 *first_invalid_escape = NULL;
6233
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006235 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 }
6237 /* Escaped strings will always be longer than the resulting
6238 Unicode string, so we start with size here and then reduce the
6239 length after conversion to the true value.
6240 (but if the error callback returns a long replacement string
6241 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006242 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006243 writer.min_length = size;
6244 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6245 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006246 }
6247
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 end = s + size;
6249 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006250 unsigned char c = (unsigned char) *s++;
6251 Py_UCS4 ch;
6252 int count;
6253 Py_ssize_t startinpos;
6254 Py_ssize_t endinpos;
6255 const char *message;
6256
6257#define WRITE_ASCII_CHAR(ch) \
6258 do { \
6259 assert(ch <= 127); \
6260 assert(writer.pos < writer.size); \
6261 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6262 } while(0)
6263
6264#define WRITE_CHAR(ch) \
6265 do { \
6266 if (ch <= writer.maxchar) { \
6267 assert(writer.pos < writer.size); \
6268 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6269 } \
6270 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6271 goto onError; \
6272 } \
6273 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274
6275 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006276 if (c != '\\') {
6277 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 continue;
6279 }
6280
Victor Stinner62ec3312016-09-06 17:04:34 -07006281 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 if (s >= end) {
6284 message = "\\ at end of string";
6285 goto error;
6286 }
6287 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006288
Victor Stinner62ec3312016-09-06 17:04:34 -07006289 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006290 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006293 case '\n': continue;
6294 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6295 case '\'': WRITE_ASCII_CHAR('\''); continue;
6296 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6297 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006298 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006299 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6300 case 't': WRITE_ASCII_CHAR('\t'); continue;
6301 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6302 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006303 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006305 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006306 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 case '0': case '1': case '2': case '3':
6310 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006312 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006313 ch = (ch<<3) + *s++ - '0';
6314 if (s < end && '0' <= *s && *s <= '7') {
6315 ch = (ch<<3) + *s++ - '0';
6316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006318 WRITE_CHAR(ch);
6319 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 /* hex escapes */
6322 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006324 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006325 message = "truncated \\xXX escape";
6326 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006331 message = "truncated \\uXXXX escape";
6332 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006335 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006337 message = "truncated \\UXXXXXXXX escape";
6338 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006339 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006340 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 ch <<= 4;
6342 if (c >= '0' && c <= '9') {
6343 ch += c - '0';
6344 }
6345 else if (c >= 'a' && c <= 'f') {
6346 ch += c - ('a' - 10);
6347 }
6348 else if (c >= 'A' && c <= 'F') {
6349 ch += c - ('A' - 10);
6350 }
6351 else {
6352 break;
6353 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006354 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006356 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006357 }
6358
6359 /* when we get here, ch is a 32-bit unicode character */
6360 if (ch > MAX_UNICODE) {
6361 message = "illegal Unicode character";
6362 goto error;
6363 }
6364
6365 WRITE_CHAR(ch);
6366 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006367
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006369 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006370 if (ucnhash_CAPI == NULL) {
6371 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006372 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6373 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006374 if (ucnhash_CAPI == NULL) {
6375 PyErr_SetString(
6376 PyExc_UnicodeError,
6377 "\\N escapes not supported (can't load unicodedata module)"
6378 );
6379 goto onError;
6380 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006381 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006382
6383 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006384 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006385 const char *start = ++s;
6386 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006387 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006388 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006389 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006390 namelen = s - start;
6391 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006392 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006393 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006394 ch = 0xffffffff; /* in case 'getcode' messes up */
6395 if (namelen <= INT_MAX &&
6396 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6397 &ch, 0)) {
6398 assert(ch <= MAX_UNICODE);
6399 WRITE_CHAR(ch);
6400 continue;
6401 }
6402 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006403 }
6404 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006405 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006406
6407 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006408 if (*first_invalid_escape == NULL) {
6409 *first_invalid_escape = s-1; /* Back up one char, since we've
6410 already incremented s. */
6411 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 WRITE_ASCII_CHAR('\\');
6413 WRITE_CHAR(c);
6414 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006416
6417 error:
6418 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006420 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006421 errors, &errorHandler,
6422 "unicodeescape", message,
6423 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006425 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006427 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006428
6429#undef WRITE_ASCII_CHAR
6430#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006432
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006433 Py_XDECREF(errorHandler);
6434 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006435 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006436
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006438 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 Py_XDECREF(errorHandler);
6440 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 return NULL;
6442}
6443
Eric V. Smith42454af2016-10-31 09:22:08 -04006444PyObject *
6445PyUnicode_DecodeUnicodeEscape(const char *s,
6446 Py_ssize_t size,
6447 const char *errors)
6448{
6449 const char *first_invalid_escape;
6450 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6451 &first_invalid_escape);
6452 if (result == NULL)
6453 return NULL;
6454 if (first_invalid_escape != NULL) {
6455 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6456 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006457 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006458 Py_DECREF(result);
6459 return NULL;
6460 }
6461 }
6462 return result;
6463}
6464
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006465/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466
Alexander Belopolsky40018472011-02-26 01:02:56 +00006467PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006468PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006470 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006473 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006474 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006475 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476
Ezio Melottie7f90372012-10-05 03:33:31 +03006477 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006478 escape.
6479
Ezio Melottie7f90372012-10-05 03:33:31 +03006480 For UCS1 strings it's '\xxx', 4 bytes per source character.
6481 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6482 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006483 */
6484
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006485 if (!PyUnicode_Check(unicode)) {
6486 PyErr_BadArgument();
6487 return NULL;
6488 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006489 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006490 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006491 }
Victor Stinner358af132015-10-12 22:36:57 +02006492
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006494 if (len == 0) {
6495 return PyBytes_FromStringAndSize(NULL, 0);
6496 }
6497
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006498 kind = PyUnicode_KIND(unicode);
6499 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006500 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6501 bytes, and 1 byte characters 4. */
6502 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006503 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006504 return PyErr_NoMemory();
6505 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006506 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006507 if (repr == NULL) {
6508 return NULL;
6509 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006510
Victor Stinner62ec3312016-09-06 17:04:34 -07006511 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006512 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006513 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006514
Victor Stinner62ec3312016-09-06 17:04:34 -07006515 /* U+0000-U+00ff range */
6516 if (ch < 0x100) {
6517 if (ch >= ' ' && ch < 127) {
6518 if (ch != '\\') {
6519 /* Copy printable US ASCII as-is */
6520 *p++ = (char) ch;
6521 }
6522 /* Escape backslashes */
6523 else {
6524 *p++ = '\\';
6525 *p++ = '\\';
6526 }
6527 }
Victor Stinner358af132015-10-12 22:36:57 +02006528
Victor Stinner62ec3312016-09-06 17:04:34 -07006529 /* Map special whitespace to '\t', \n', '\r' */
6530 else if (ch == '\t') {
6531 *p++ = '\\';
6532 *p++ = 't';
6533 }
6534 else if (ch == '\n') {
6535 *p++ = '\\';
6536 *p++ = 'n';
6537 }
6538 else if (ch == '\r') {
6539 *p++ = '\\';
6540 *p++ = 'r';
6541 }
6542
6543 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6544 else {
6545 *p++ = '\\';
6546 *p++ = 'x';
6547 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6548 *p++ = Py_hexdigits[ch & 0x000F];
6549 }
Tim Petersced69f82003-09-16 20:30:58 +00006550 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006551 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006552 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 *p++ = '\\';
6554 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006555 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6556 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6557 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6558 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006560 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6561 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006562
Victor Stinner62ec3312016-09-06 17:04:34 -07006563 /* Make sure that the first two digits are zero */
6564 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006565 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006566 *p++ = 'U';
6567 *p++ = '0';
6568 *p++ = '0';
6569 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6570 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6571 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6572 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6573 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6574 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577
Victor Stinner62ec3312016-09-06 17:04:34 -07006578 assert(p - PyBytes_AS_STRING(repr) > 0);
6579 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6580 return NULL;
6581 }
6582 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583}
6584
Alexander Belopolsky40018472011-02-26 01:02:56 +00006585PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006586PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6587 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006589 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006590 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006591 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006593 }
6594
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006595 result = PyUnicode_AsUnicodeEscapeString(tmp);
6596 Py_DECREF(tmp);
6597 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598}
6599
6600/* --- Raw Unicode Escape Codec ------------------------------------------- */
6601
Alexander Belopolsky40018472011-02-26 01:02:56 +00006602PyObject *
6603PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006604 Py_ssize_t size,
6605 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006607 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006608 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610 PyObject *errorHandler = NULL;
6611 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006612
Victor Stinner62ec3312016-09-06 17:04:34 -07006613 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006614 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006615 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 /* Escaped strings will always be longer than the resulting
6618 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006619 length after conversion to the true value. (But decoding error
6620 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006621 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006622 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006623 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6624 goto onError;
6625 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006626
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 end = s + size;
6628 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006629 unsigned char c = (unsigned char) *s++;
6630 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006631 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006632 Py_ssize_t startinpos;
6633 Py_ssize_t endinpos;
6634 const char *message;
6635
6636#define WRITE_CHAR(ch) \
6637 do { \
6638 if (ch <= writer.maxchar) { \
6639 assert(writer.pos < writer.size); \
6640 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6641 } \
6642 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6643 goto onError; \
6644 } \
6645 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006648 if (c != '\\' || s >= end) {
6649 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006651 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006652
Victor Stinner62ec3312016-09-06 17:04:34 -07006653 c = (unsigned char) *s++;
6654 if (c == 'u') {
6655 count = 4;
6656 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006658 else if (c == 'U') {
6659 count = 8;
6660 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006661 }
6662 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006663 assert(writer.pos < writer.size);
6664 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6665 WRITE_CHAR(c);
6666 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006667 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006668 startinpos = s - starts - 2;
6669
6670 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6671 for (ch = 0; count && s < end; ++s, --count) {
6672 c = (unsigned char)*s;
6673 ch <<= 4;
6674 if (c >= '0' && c <= '9') {
6675 ch += c - '0';
6676 }
6677 else if (c >= 'a' && c <= 'f') {
6678 ch += c - ('a' - 10);
6679 }
6680 else if (c >= 'A' && c <= 'F') {
6681 ch += c - ('A' - 10);
6682 }
6683 else {
6684 break;
6685 }
6686 }
6687 if (!count) {
6688 if (ch <= MAX_UNICODE) {
6689 WRITE_CHAR(ch);
6690 continue;
6691 }
6692 message = "\\Uxxxxxxxx out of range";
6693 }
6694
6695 endinpos = s-starts;
6696 writer.min_length = end - s + writer.pos;
6697 if (unicode_decode_call_errorhandler_writer(
6698 errors, &errorHandler,
6699 "rawunicodeescape", message,
6700 &starts, &end, &startinpos, &endinpos, &exc, &s,
6701 &writer)) {
6702 goto onError;
6703 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006704 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006705
6706#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 Py_XDECREF(errorHandler);
6709 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006710 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006711
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006713 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 Py_XDECREF(errorHandler);
6715 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006717
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718}
6719
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006720
Alexander Belopolsky40018472011-02-26 01:02:56 +00006721PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006722PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723{
Victor Stinner62ec3312016-09-06 17:04:34 -07006724 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006726 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006727 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006728 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006729 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006731 if (!PyUnicode_Check(unicode)) {
6732 PyErr_BadArgument();
6733 return NULL;
6734 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006735 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006736 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006737 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006738 kind = PyUnicode_KIND(unicode);
6739 data = PyUnicode_DATA(unicode);
6740 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006741 if (kind == PyUnicode_1BYTE_KIND) {
6742 return PyBytes_FromStringAndSize(data, len);
6743 }
Victor Stinner0e368262011-11-10 20:12:49 +01006744
Victor Stinner62ec3312016-09-06 17:04:34 -07006745 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6746 bytes, and 1 byte characters 4. */
6747 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006748
Victor Stinner62ec3312016-09-06 17:04:34 -07006749 if (len > PY_SSIZE_T_MAX / expandsize) {
6750 return PyErr_NoMemory();
6751 }
6752 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6753 if (repr == NULL) {
6754 return NULL;
6755 }
6756 if (len == 0) {
6757 return repr;
6758 }
6759
6760 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006761 for (pos = 0; pos < len; pos++) {
6762 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006763
Victor Stinner62ec3312016-09-06 17:04:34 -07006764 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6765 if (ch < 0x100) {
6766 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006767 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006768 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006769 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 *p++ = '\\';
6771 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006772 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6773 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6774 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6775 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006777 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6778 else {
6779 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6780 *p++ = '\\';
6781 *p++ = 'U';
6782 *p++ = '0';
6783 *p++ = '0';
6784 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6785 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6786 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6787 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6788 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6789 *p++ = Py_hexdigits[ch & 15];
6790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006792
Victor Stinner62ec3312016-09-06 17:04:34 -07006793 assert(p > PyBytes_AS_STRING(repr));
6794 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6795 return NULL;
6796 }
6797 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798}
6799
Alexander Belopolsky40018472011-02-26 01:02:56 +00006800PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006801PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6802 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006804 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006805 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006806 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006807 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006808 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6809 Py_DECREF(tmp);
6810 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811}
6812
6813/* --- Latin-1 Codec ------------------------------------------------------ */
6814
Alexander Belopolsky40018472011-02-26 01:02:56 +00006815PyObject *
6816PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006817 Py_ssize_t size,
6818 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006821 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822}
6823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006824/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006825static void
6826make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006827 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006828 PyObject *unicode,
6829 Py_ssize_t startpos, Py_ssize_t endpos,
6830 const char *reason)
6831{
6832 if (*exceptionObject == NULL) {
6833 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006834 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006835 encoding, unicode, startpos, endpos, reason);
6836 }
6837 else {
6838 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6839 goto onError;
6840 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6841 goto onError;
6842 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6843 goto onError;
6844 return;
6845 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006846 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006847 }
6848}
6849
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006850/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006851static void
6852raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006853 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006854 PyObject *unicode,
6855 Py_ssize_t startpos, Py_ssize_t endpos,
6856 const char *reason)
6857{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006858 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006859 encoding, unicode, startpos, endpos, reason);
6860 if (*exceptionObject != NULL)
6861 PyCodec_StrictErrors(*exceptionObject);
6862}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006863
6864/* error handling callback helper:
6865 build arguments, call the callback and check the arguments,
6866 put the result into newpos and return the replacement string, which
6867 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006868static PyObject *
6869unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006870 PyObject **errorHandler,
6871 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006872 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006873 Py_ssize_t startpos, Py_ssize_t endpos,
6874 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006875{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006876 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006877 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006878 PyObject *restuple;
6879 PyObject *resunicode;
6880
6881 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006883 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006885 }
6886
Benjamin Petersonbac79492012-01-14 13:34:47 -05006887 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006888 return NULL;
6889 len = PyUnicode_GET_LENGTH(unicode);
6890
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006891 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006892 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006893 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006895
Petr Viktorinffd97532020-02-11 17:46:57 +01006896 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006897 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006900 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 Py_DECREF(restuple);
6902 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006904 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 &resunicode, newpos)) {
6906 Py_DECREF(restuple);
6907 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006909 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6910 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6911 Py_DECREF(restuple);
6912 return NULL;
6913 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006915 *newpos = len + *newpos;
6916 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006917 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 Py_DECREF(restuple);
6919 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006920 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006921 Py_INCREF(resunicode);
6922 Py_DECREF(restuple);
6923 return resunicode;
6924}
6925
Alexander Belopolsky40018472011-02-26 01:02:56 +00006926static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006927unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006928 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006929 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006931 /* input state */
6932 Py_ssize_t pos=0, size;
6933 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006934 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006935 /* pointer into the output */
6936 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006937 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6938 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006939 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006941 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006942 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006943 /* output object */
6944 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006945
Benjamin Petersonbac79492012-01-14 13:34:47 -05006946 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006947 return NULL;
6948 size = PyUnicode_GET_LENGTH(unicode);
6949 kind = PyUnicode_KIND(unicode);
6950 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006951 /* allocate enough for a simple encoding without
6952 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006953 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006954 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006955
6956 _PyBytesWriter_Init(&writer);
6957 str = _PyBytesWriter_Alloc(&writer, size);
6958 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006959 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006960
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006961 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006962 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006963
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006965 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006967 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006968 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006969 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006971 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006973 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006974 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006976
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006977 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006979
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006980 /* Only overallocate the buffer if it's not the last write */
6981 writer.overallocate = (collend < size);
6982
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006984 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006985 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006986
6987 switch (error_handler) {
6988 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006989 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006991
6992 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006993 memset(str, '?', collend - collstart);
6994 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006995 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006996 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006997 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 break;
Victor Stinner50149202015-09-22 00:26:54 +02006999
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007000 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007001 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007002 writer.min_size -= (collend - collstart);
7003 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007004 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007005 if (str == NULL)
7006 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007007 pos = collend;
7008 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007009
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007010 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007011 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007012 writer.min_size -= (collend - collstart);
7013 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007014 unicode, collstart, collend);
7015 if (str == NULL)
7016 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007017 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 break;
Victor Stinner50149202015-09-22 00:26:54 +02007019
Victor Stinnerc3713e92015-09-29 12:32:13 +02007020 case _Py_ERROR_SURROGATEESCAPE:
7021 for (i = collstart; i < collend; ++i) {
7022 ch = PyUnicode_READ(kind, data, i);
7023 if (ch < 0xdc80 || 0xdcff < ch) {
7024 /* Not a UTF-8b surrogate */
7025 break;
7026 }
7027 *str++ = (char)(ch - 0xdc00);
7028 ++pos;
7029 }
7030 if (i >= collend)
7031 break;
7032 collstart = pos;
7033 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007034 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007035
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007037 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7038 encoding, reason, unicode, &exc,
7039 collstart, collend, &newpos);
7040 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007042
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007043 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007044 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007045
Victor Stinner6bd525b2015-10-09 13:10:05 +02007046 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007047 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007048 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007049 PyBytes_AS_STRING(rep),
7050 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007051 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007052 else {
7053 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007054
Victor Stinner6bd525b2015-10-09 13:10:05 +02007055 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007057
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007058 if (limit == 256 ?
7059 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7060 !PyUnicode_IS_ASCII(rep))
7061 {
7062 /* Not all characters are smaller than limit */
7063 raise_encode_exception(&exc, encoding, unicode,
7064 collstart, collend, reason);
7065 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007067 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7068 str = _PyBytesWriter_WriteBytes(&writer, str,
7069 PyUnicode_DATA(rep),
7070 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007072 if (str == NULL)
7073 goto onError;
7074
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007075 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007076 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007077 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007078
7079 /* If overallocation was disabled, ensure that it was the last
7080 write. Otherwise, we missed an optimization */
7081 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007082 }
7083 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007084
Victor Stinner50149202015-09-22 00:26:54 +02007085 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007086 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007087 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007088
7089 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007090 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007091 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007092 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007093 Py_XDECREF(exc);
7094 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007095}
7096
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007097/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007098PyObject *
7099PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007100 Py_ssize_t size,
7101 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007103 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007104 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007105 if (unicode == NULL)
7106 return NULL;
7107 result = unicode_encode_ucs1(unicode, errors, 256);
7108 Py_DECREF(unicode);
7109 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110}
7111
Alexander Belopolsky40018472011-02-26 01:02:56 +00007112PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007113_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114{
7115 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 PyErr_BadArgument();
7117 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007119 if (PyUnicode_READY(unicode) == -1)
7120 return NULL;
7121 /* Fast path: if it is a one-byte string, construct
7122 bytes object directly. */
7123 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7124 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7125 PyUnicode_GET_LENGTH(unicode));
7126 /* Non-Latin-1 characters present. Defer to above function to
7127 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007128 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007129}
7130
7131PyObject*
7132PyUnicode_AsLatin1String(PyObject *unicode)
7133{
7134 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135}
7136
7137/* --- 7-bit ASCII Codec -------------------------------------------------- */
7138
Alexander Belopolsky40018472011-02-26 01:02:56 +00007139PyObject *
7140PyUnicode_DecodeASCII(const char *s,
7141 Py_ssize_t size,
7142 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007144 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007145 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007146 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007147 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007148 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007149
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007151 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007152
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007154 if (size == 1 && (unsigned char)s[0] < 128)
7155 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007156
Inada Naoki770847a2019-06-24 12:30:24 +09007157 // Shortcut for simple case
7158 PyObject *u = PyUnicode_New(size, 127);
7159 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007160 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007161 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007162 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007163 if (outpos == size) {
7164 return u;
7165 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007166
Inada Naoki770847a2019-06-24 12:30:24 +09007167 _PyUnicodeWriter writer;
7168 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007169 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007170
Inada Naoki770847a2019-06-24 12:30:24 +09007171 s += outpos;
7172 int kind = writer.kind;
7173 void *data = writer.data;
7174 Py_ssize_t startinpos, endinpos;
7175
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007176 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007177 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007179 PyUnicode_WRITE(kind, data, writer.pos, c);
7180 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007182 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007183 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007184
7185 /* byte outsize range 0x00..0x7f: call the error handler */
7186
7187 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007188 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007189
7190 switch (error_handler)
7191 {
7192 case _Py_ERROR_REPLACE:
7193 case _Py_ERROR_SURROGATEESCAPE:
7194 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007195 but we may switch to UCS2 at the first write */
7196 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7197 goto onError;
7198 kind = writer.kind;
7199 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007200
7201 if (error_handler == _Py_ERROR_REPLACE)
7202 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7203 else
7204 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7205 writer.pos++;
7206 ++s;
7207 break;
7208
7209 case _Py_ERROR_IGNORE:
7210 ++s;
7211 break;
7212
7213 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 startinpos = s-starts;
7215 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007216 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007217 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 "ascii", "ordinal not in range(128)",
7219 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007220 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007222 kind = writer.kind;
7223 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007226 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007227 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007228 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007229
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007231 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007232 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007233 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234 return NULL;
7235}
7236
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007237/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007238PyObject *
7239PyUnicode_EncodeASCII(const Py_UNICODE *p,
7240 Py_ssize_t size,
7241 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007243 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007244 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007245 if (unicode == NULL)
7246 return NULL;
7247 result = unicode_encode_ucs1(unicode, errors, 128);
7248 Py_DECREF(unicode);
7249 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250}
7251
Alexander Belopolsky40018472011-02-26 01:02:56 +00007252PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007253_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254{
7255 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007256 PyErr_BadArgument();
7257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007259 if (PyUnicode_READY(unicode) == -1)
7260 return NULL;
7261 /* Fast path: if it is an ASCII-only string, construct bytes object
7262 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007263 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007264 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7265 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007266 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007267}
7268
7269PyObject *
7270PyUnicode_AsASCIIString(PyObject *unicode)
7271{
7272 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273}
7274
Steve Dowercc16be82016-09-08 10:35:16 -07007275#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007276
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007277/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007278
/* NEED_RETRY is defined when an int is narrower than a size_t. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX / 4)

/* Supply the flag value when the included headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7292
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007293static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007294code_page_name(UINT code_page, PyObject **obj)
7295{
7296 *obj = NULL;
7297 if (code_page == CP_ACP)
7298 return "mbcs";
7299 if (code_page == CP_UTF7)
7300 return "CP_UTF7";
7301 if (code_page == CP_UTF8)
7302 return "CP_UTF8";
7303
7304 *obj = PyBytes_FromFormat("cp%u", code_page);
7305 if (*obj == NULL)
7306 return NULL;
7307 return PyBytes_AS_STRING(*obj);
7308}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309
Victor Stinner3a50e702011-10-18 21:21:00 +02007310static DWORD
7311decode_code_page_flags(UINT code_page)
7312{
7313 if (code_page == CP_UTF7) {
7314 /* The CP_UTF7 decoder only supports flags=0 */
7315 return 0;
7316 }
7317 else
7318 return MB_ERR_INVALID_CHARS;
7319}
7320
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007322 * Decode a byte string from a Windows code page into unicode object in strict
7323 * mode.
7324 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007325 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7326 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007327 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007328static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007329decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007330 wchar_t **buf,
7331 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 const char *in,
7333 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007334{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007335 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007336 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007337 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
7339 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007341 while ((outsize = MultiByteToWideChar(code_page, flags,
7342 in, insize, NULL, 0)) <= 0)
7343 {
7344 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7345 goto error;
7346 }
7347 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7348 flags = 0;
7349 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007350
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007351 /* Extend a wchar_t* buffer */
7352 Py_ssize_t n = *bufsize; /* Get the current length */
7353 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7354 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007355 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007356 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007357
7358 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007359 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7360 if (outsize <= 0)
7361 goto error;
7362 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007363
Victor Stinner3a50e702011-10-18 21:21:00 +02007364error:
7365 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7366 return -2;
7367 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007368 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007369}
7370
Victor Stinner3a50e702011-10-18 21:21:00 +02007371/*
7372 * Decode a byte string from a code page into unicode object with an error
7373 * handler.
7374 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007375 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 * UnicodeDecodeError exception and returns -1 on error.
7377 */
7378static int
7379decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007380 wchar_t **buf,
7381 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007382 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007383 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007384{
7385 const char *startin = in;
7386 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007387 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 /* Ideally, we should get reason from FormatMessage. This is the Windows
7389 2000 English version of the message. */
7390 const char *reason = "No mapping for the Unicode character exists "
7391 "in the target code page.";
7392 /* each step cannot decode more than 1 character, but a character can be
7393 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007394 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007395 int insize;
7396 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 PyObject *errorHandler = NULL;
7398 PyObject *exc = NULL;
7399 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007400 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007401 DWORD err;
7402 int ret = -1;
7403
7404 assert(size > 0);
7405
7406 encoding = code_page_name(code_page, &encoding_obj);
7407 if (encoding == NULL)
7408 return -1;
7409
Victor Stinner7d00cc12014-03-17 23:08:06 +01007410 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7412 UnicodeDecodeError. */
7413 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7414 if (exc != NULL) {
7415 PyCodec_StrictErrors(exc);
7416 Py_CLEAR(exc);
7417 }
7418 goto error;
7419 }
7420
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007421 /* Extend a wchar_t* buffer */
7422 Py_ssize_t n = *bufsize; /* Get the current length */
7423 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7424 PyErr_NoMemory();
7425 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007427 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7428 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007430 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007431
7432 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 while (in < endin)
7434 {
7435 /* Decode a character */
7436 insize = 1;
7437 do
7438 {
7439 outsize = MultiByteToWideChar(code_page, flags,
7440 in, insize,
7441 buffer, Py_ARRAY_LENGTH(buffer));
7442 if (outsize > 0)
7443 break;
7444 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007445 if (err == ERROR_INVALID_FLAGS && flags) {
7446 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7447 flags = 0;
7448 continue;
7449 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 if (err != ERROR_NO_UNICODE_TRANSLATION
7451 && err != ERROR_INSUFFICIENT_BUFFER)
7452 {
7453 PyErr_SetFromWindowsErr(0);
7454 goto error;
7455 }
7456 insize++;
7457 }
7458 /* 4=maximum length of a UTF-8 sequence */
7459 while (insize <= 4 && (in + insize) <= endin);
7460
7461 if (outsize <= 0) {
7462 Py_ssize_t startinpos, endinpos, outpos;
7463
Victor Stinner7d00cc12014-03-17 23:08:06 +01007464 /* last character in partial decode? */
7465 if (in + insize >= endin && !final)
7466 break;
7467
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 startinpos = in - startin;
7469 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007470 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007471 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 errors, &errorHandler,
7473 encoding, reason,
7474 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007475 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 {
7477 goto error;
7478 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007479 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 }
7481 else {
7482 in += insize;
7483 memcpy(out, buffer, outsize * sizeof(wchar_t));
7484 out += outsize;
7485 }
7486 }
7487
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007488 /* Shrink the buffer */
7489 assert(out - *buf <= *bufsize);
7490 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007491 /* (in - startin) <= size and size is an int */
7492 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007493
7494error:
7495 Py_XDECREF(encoding_obj);
7496 Py_XDECREF(errorHandler);
7497 Py_XDECREF(exc);
7498 return ret;
7499}
7500
Victor Stinner3a50e702011-10-18 21:21:00 +02007501static PyObject *
7502decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007503 const char *s, Py_ssize_t size,
7504 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007505{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007506 wchar_t *buf = NULL;
7507 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007508 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007509
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 if (code_page < 0) {
7511 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7512 return NULL;
7513 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007514 if (size < 0) {
7515 PyErr_BadInternalCall();
7516 return NULL;
7517 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007518
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007519 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007521
Victor Stinner76a31a62011-11-04 00:05:13 +01007522 do
7523 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007524#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007525 if (size > DECODING_CHUNK_SIZE) {
7526 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007527 final = 0;
7528 done = 0;
7529 }
7530 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007531#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007532 {
7533 chunk_size = (int)size;
7534 final = (consumed == NULL);
7535 done = 1;
7536 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007537
Victor Stinner76a31a62011-11-04 00:05:13 +01007538 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007539 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007540 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007541 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007542 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007543
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007544 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007545 s, chunk_size);
7546 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007547 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007548 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007549 errors, final);
7550 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007551
7552 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007553 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007554 return NULL;
7555 }
7556
7557 if (consumed)
7558 *consumed += converted;
7559
7560 s += converted;
7561 size -= converted;
7562 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007563
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007564 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7565 PyMem_Free(buf);
7566 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007567}
7568
Alexander Belopolsky40018472011-02-26 01:02:56 +00007569PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007570PyUnicode_DecodeCodePageStateful(int code_page,
7571 const char *s,
7572 Py_ssize_t size,
7573 const char *errors,
7574 Py_ssize_t *consumed)
7575{
7576 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7577}
7578
7579PyObject *
7580PyUnicode_DecodeMBCSStateful(const char *s,
7581 Py_ssize_t size,
7582 const char *errors,
7583 Py_ssize_t *consumed)
7584{
7585 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7586}
7587
7588PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007589PyUnicode_DecodeMBCS(const char *s,
7590 Py_ssize_t size,
7591 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007592{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007593 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7594}
7595
Victor Stinner3a50e702011-10-18 21:21:00 +02007596static DWORD
7597encode_code_page_flags(UINT code_page, const char *errors)
7598{
7599 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007600 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 }
7602 else if (code_page == CP_UTF7) {
7603 /* CP_UTF7 only supports flags=0 */
7604 return 0;
7605 }
7606 else {
7607 if (errors != NULL && strcmp(errors, "replace") == 0)
7608 return 0;
7609 else
7610 return WC_NO_BEST_FIT_CHARS;
7611 }
7612}
7613
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007614/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007615 * Encode a Unicode string to a Windows code page into a byte string in strict
7616 * mode.
7617 *
7618 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007619 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007620 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007621static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007622encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007623 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007625{
Victor Stinner554f3f02010-06-16 23:33:54 +00007626 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 BOOL *pusedDefaultChar = &usedDefaultChar;
7628 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007629 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007630 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 const DWORD flags = encode_code_page_flags(code_page, NULL);
7632 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007633 /* Create a substring so that we can get the UTF-16 representation
7634 of just the slice under consideration. */
7635 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007636
Martin v. Löwis3d325192011-11-04 18:23:06 +01007637 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007638
Victor Stinner3a50e702011-10-18 21:21:00 +02007639 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007640 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007641 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007642 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007643
Victor Stinner2fc507f2011-11-04 20:06:39 +01007644 substring = PyUnicode_Substring(unicode, offset, offset+len);
7645 if (substring == NULL)
7646 return -1;
7647 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7648 if (p == NULL) {
7649 Py_DECREF(substring);
7650 return -1;
7651 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007652 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007653
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007654 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007655 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007656 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007657 NULL, 0,
7658 NULL, pusedDefaultChar);
7659 if (outsize <= 0)
7660 goto error;
7661 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007662 if (pusedDefaultChar && *pusedDefaultChar) {
7663 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007665 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007666
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007669 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007670 if (*outbytes == NULL) {
7671 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007673 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007674 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007675 }
7676 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007678 const Py_ssize_t n = PyBytes_Size(*outbytes);
7679 if (outsize > PY_SSIZE_T_MAX - n) {
7680 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007681 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007683 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007684 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7685 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007686 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007687 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007688 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007689 }
7690
7691 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007693 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 out, outsize,
7695 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007696 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007697 if (outsize <= 0)
7698 goto error;
7699 if (pusedDefaultChar && *pusedDefaultChar)
7700 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007701 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007702
Victor Stinner3a50e702011-10-18 21:21:00 +02007703error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007704 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007705 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7706 return -2;
7707 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007708 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007709}
7710
Victor Stinner3a50e702011-10-18 21:21:00 +02007711/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007712 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 * error handler.
7714 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007715 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007716 * -1 on other error.
7717 */
7718static int
7719encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007720 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007721 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007722{
Victor Stinner3a50e702011-10-18 21:21:00 +02007723 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007724 Py_ssize_t pos = unicode_offset;
7725 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007726 /* Ideally, we should get reason from FormatMessage. This is the Windows
7727 2000 English version of the message. */
7728 const char *reason = "invalid character";
7729 /* 4=maximum length of a UTF-8 sequence */
7730 char buffer[4];
7731 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7732 Py_ssize_t outsize;
7733 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007734 PyObject *errorHandler = NULL;
7735 PyObject *exc = NULL;
7736 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007737 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007738 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007739 PyObject *rep;
7740 int ret = -1;
7741
7742 assert(insize > 0);
7743
7744 encoding = code_page_name(code_page, &encoding_obj);
7745 if (encoding == NULL)
7746 return -1;
7747
7748 if (errors == NULL || strcmp(errors, "strict") == 0) {
7749 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7750 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007751 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007752 if (exc != NULL) {
7753 PyCodec_StrictErrors(exc);
7754 Py_DECREF(exc);
7755 }
7756 Py_XDECREF(encoding_obj);
7757 return -1;
7758 }
7759
7760 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7761 pusedDefaultChar = &usedDefaultChar;
7762 else
7763 pusedDefaultChar = NULL;
7764
7765 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7766 PyErr_NoMemory();
7767 goto error;
7768 }
7769 outsize = insize * Py_ARRAY_LENGTH(buffer);
7770
7771 if (*outbytes == NULL) {
7772 /* Create string object */
7773 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7774 if (*outbytes == NULL)
7775 goto error;
7776 out = PyBytes_AS_STRING(*outbytes);
7777 }
7778 else {
7779 /* Extend string object */
7780 Py_ssize_t n = PyBytes_Size(*outbytes);
7781 if (n > PY_SSIZE_T_MAX - outsize) {
7782 PyErr_NoMemory();
7783 goto error;
7784 }
7785 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7786 goto error;
7787 out = PyBytes_AS_STRING(*outbytes) + n;
7788 }
7789
7790 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007791 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007792 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007793 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7794 wchar_t chars[2];
7795 int charsize;
7796 if (ch < 0x10000) {
7797 chars[0] = (wchar_t)ch;
7798 charsize = 1;
7799 }
7800 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007801 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7802 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007803 charsize = 2;
7804 }
7805
Victor Stinner3a50e702011-10-18 21:21:00 +02007806 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007807 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007808 buffer, Py_ARRAY_LENGTH(buffer),
7809 NULL, pusedDefaultChar);
7810 if (outsize > 0) {
7811 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7812 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007813 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007814 memcpy(out, buffer, outsize);
7815 out += outsize;
7816 continue;
7817 }
7818 }
7819 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7820 PyErr_SetFromWindowsErr(0);
7821 goto error;
7822 }
7823
Victor Stinner3a50e702011-10-18 21:21:00 +02007824 rep = unicode_encode_call_errorhandler(
7825 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007826 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007827 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007828 if (rep == NULL)
7829 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007830 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007831
7832 if (PyBytes_Check(rep)) {
7833 outsize = PyBytes_GET_SIZE(rep);
7834 if (outsize != 1) {
7835 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7836 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7837 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7838 Py_DECREF(rep);
7839 goto error;
7840 }
7841 out = PyBytes_AS_STRING(*outbytes) + offset;
7842 }
7843 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7844 out += outsize;
7845 }
7846 else {
7847 Py_ssize_t i;
7848 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007849 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007850
Benjamin Petersonbac79492012-01-14 13:34:47 -05007851 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007852 Py_DECREF(rep);
7853 goto error;
7854 }
7855
7856 outsize = PyUnicode_GET_LENGTH(rep);
7857 if (outsize != 1) {
7858 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7859 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7860 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7861 Py_DECREF(rep);
7862 goto error;
7863 }
7864 out = PyBytes_AS_STRING(*outbytes) + offset;
7865 }
7866 kind = PyUnicode_KIND(rep);
7867 data = PyUnicode_DATA(rep);
7868 for (i=0; i < outsize; i++) {
7869 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7870 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007871 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007872 encoding, unicode,
7873 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007874 "unable to encode error handler result to ASCII");
7875 Py_DECREF(rep);
7876 goto error;
7877 }
7878 *out = (unsigned char)ch;
7879 out++;
7880 }
7881 }
7882 Py_DECREF(rep);
7883 }
7884 /* write a NUL byte */
7885 *out = 0;
7886 outsize = out - PyBytes_AS_STRING(*outbytes);
7887 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7888 if (_PyBytes_Resize(outbytes, outsize) < 0)
7889 goto error;
7890 ret = 0;
7891
7892error:
7893 Py_XDECREF(encoding_obj);
7894 Py_XDECREF(errorHandler);
7895 Py_XDECREF(exc);
7896 return ret;
7897}
7898
Victor Stinner3a50e702011-10-18 21:21:00 +02007899static PyObject *
7900encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007901 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007902 const char *errors)
7903{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007904 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007905 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007906 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007907 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007908
Victor Stinner29dacf22015-01-26 16:41:32 +01007909 if (!PyUnicode_Check(unicode)) {
7910 PyErr_BadArgument();
7911 return NULL;
7912 }
7913
Benjamin Petersonbac79492012-01-14 13:34:47 -05007914 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007915 return NULL;
7916 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007917
Victor Stinner3a50e702011-10-18 21:21:00 +02007918 if (code_page < 0) {
7919 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7920 return NULL;
7921 }
7922
Martin v. Löwis3d325192011-11-04 18:23:06 +01007923 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007924 return PyBytes_FromStringAndSize(NULL, 0);
7925
Victor Stinner7581cef2011-11-03 22:32:33 +01007926 offset = 0;
7927 do
7928 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007929#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007930 if (len > DECODING_CHUNK_SIZE) {
7931 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007932 done = 0;
7933 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007934 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007935#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007936 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007937 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007938 done = 1;
7939 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007940
Victor Stinner76a31a62011-11-04 00:05:13 +01007941 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007942 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007943 errors);
7944 if (ret == -2)
7945 ret = encode_code_page_errors(code_page, &outbytes,
7946 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007947 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007948 if (ret < 0) {
7949 Py_XDECREF(outbytes);
7950 return NULL;
7951 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007952
Victor Stinner7581cef2011-11-03 22:32:33 +01007953 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007954 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007955 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007956
Victor Stinner3a50e702011-10-18 21:21:00 +02007957 return outbytes;
7958}
7959
7960PyObject *
7961PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7962 Py_ssize_t size,
7963 const char *errors)
7964{
Victor Stinner7581cef2011-11-03 22:32:33 +01007965 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007966 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007967 if (unicode == NULL)
7968 return NULL;
7969 res = encode_code_page(CP_ACP, unicode, errors);
7970 Py_DECREF(unicode);
7971 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007972}
7973
7974PyObject *
7975PyUnicode_EncodeCodePage(int code_page,
7976 PyObject *unicode,
7977 const char *errors)
7978{
Victor Stinner7581cef2011-11-03 22:32:33 +01007979 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007980}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007981
Alexander Belopolsky40018472011-02-26 01:02:56 +00007982PyObject *
7983PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007984{
Victor Stinner7581cef2011-11-03 22:32:33 +01007985 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007986}
7987
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007988#undef NEED_RETRY
7989
Steve Dowercc16be82016-09-08 10:35:16 -07007990#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007991
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992/* --- Character Mapping Codec -------------------------------------------- */
7993
Victor Stinnerfb161b12013-04-18 01:44:27 +02007994static int
7995charmap_decode_string(const char *s,
7996 Py_ssize_t size,
7997 PyObject *mapping,
7998 const char *errors,
7999 _PyUnicodeWriter *writer)
8000{
8001 const char *starts = s;
8002 const char *e;
8003 Py_ssize_t startinpos, endinpos;
8004 PyObject *errorHandler = NULL, *exc = NULL;
8005 Py_ssize_t maplen;
8006 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008007 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008008 Py_UCS4 x;
8009 unsigned char ch;
8010
8011 if (PyUnicode_READY(mapping) == -1)
8012 return -1;
8013
8014 maplen = PyUnicode_GET_LENGTH(mapping);
8015 mapdata = PyUnicode_DATA(mapping);
8016 mapkind = PyUnicode_KIND(mapping);
8017
8018 e = s + size;
8019
8020 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8021 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8022 * is disabled in encoding aliases, latin1 is preferred because
8023 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008024 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008025 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8026 Py_UCS4 maxchar = writer->maxchar;
8027
8028 assert (writer->kind == PyUnicode_1BYTE_KIND);
8029 while (s < e) {
8030 ch = *s;
8031 x = mapdata_ucs1[ch];
8032 if (x > maxchar) {
8033 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8034 goto onError;
8035 maxchar = writer->maxchar;
8036 outdata = (Py_UCS1 *)writer->data;
8037 }
8038 outdata[writer->pos] = x;
8039 writer->pos++;
8040 ++s;
8041 }
8042 return 0;
8043 }
8044
8045 while (s < e) {
8046 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8047 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008048 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008049 if (outkind == PyUnicode_1BYTE_KIND) {
8050 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8051 Py_UCS4 maxchar = writer->maxchar;
8052 while (s < e) {
8053 ch = *s;
8054 x = mapdata_ucs2[ch];
8055 if (x > maxchar)
8056 goto Error;
8057 outdata[writer->pos] = x;
8058 writer->pos++;
8059 ++s;
8060 }
8061 break;
8062 }
8063 else if (outkind == PyUnicode_2BYTE_KIND) {
8064 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8065 while (s < e) {
8066 ch = *s;
8067 x = mapdata_ucs2[ch];
8068 if (x == 0xFFFE)
8069 goto Error;
8070 outdata[writer->pos] = x;
8071 writer->pos++;
8072 ++s;
8073 }
8074 break;
8075 }
8076 }
8077 ch = *s;
8078
8079 if (ch < maplen)
8080 x = PyUnicode_READ(mapkind, mapdata, ch);
8081 else
8082 x = 0xfffe; /* invalid value */
8083Error:
8084 if (x == 0xfffe)
8085 {
8086 /* undefined mapping */
8087 startinpos = s-starts;
8088 endinpos = startinpos+1;
8089 if (unicode_decode_call_errorhandler_writer(
8090 errors, &errorHandler,
8091 "charmap", "character maps to <undefined>",
8092 &starts, &e, &startinpos, &endinpos, &exc, &s,
8093 writer)) {
8094 goto onError;
8095 }
8096 continue;
8097 }
8098
8099 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8100 goto onError;
8101 ++s;
8102 }
8103 Py_XDECREF(errorHandler);
8104 Py_XDECREF(exc);
8105 return 0;
8106
8107onError:
8108 Py_XDECREF(errorHandler);
8109 Py_XDECREF(exc);
8110 return -1;
8111}
8112
8113static int
8114charmap_decode_mapping(const char *s,
8115 Py_ssize_t size,
8116 PyObject *mapping,
8117 const char *errors,
8118 _PyUnicodeWriter *writer)
8119{
8120 const char *starts = s;
8121 const char *e;
8122 Py_ssize_t startinpos, endinpos;
8123 PyObject *errorHandler = NULL, *exc = NULL;
8124 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008125 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008126
8127 e = s + size;
8128
8129 while (s < e) {
8130 ch = *s;
8131
8132 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8133 key = PyLong_FromLong((long)ch);
8134 if (key == NULL)
8135 goto onError;
8136
8137 item = PyObject_GetItem(mapping, key);
8138 Py_DECREF(key);
8139 if (item == NULL) {
8140 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8141 /* No mapping found means: mapping is undefined. */
8142 PyErr_Clear();
8143 goto Undefined;
8144 } else
8145 goto onError;
8146 }
8147
8148 /* Apply mapping */
8149 if (item == Py_None)
8150 goto Undefined;
8151 if (PyLong_Check(item)) {
8152 long value = PyLong_AS_LONG(item);
8153 if (value == 0xFFFE)
8154 goto Undefined;
8155 if (value < 0 || value > MAX_UNICODE) {
8156 PyErr_Format(PyExc_TypeError,
8157 "character mapping must be in range(0x%lx)",
8158 (unsigned long)MAX_UNICODE + 1);
8159 goto onError;
8160 }
8161
8162 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8163 goto onError;
8164 }
8165 else if (PyUnicode_Check(item)) {
8166 if (PyUnicode_READY(item) == -1)
8167 goto onError;
8168 if (PyUnicode_GET_LENGTH(item) == 1) {
8169 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8170 if (value == 0xFFFE)
8171 goto Undefined;
8172 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8173 goto onError;
8174 }
8175 else {
8176 writer->overallocate = 1;
8177 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8178 goto onError;
8179 }
8180 }
8181 else {
8182 /* wrong return value */
8183 PyErr_SetString(PyExc_TypeError,
8184 "character mapping must return integer, None or str");
8185 goto onError;
8186 }
8187 Py_CLEAR(item);
8188 ++s;
8189 continue;
8190
8191Undefined:
8192 /* undefined mapping */
8193 Py_CLEAR(item);
8194 startinpos = s-starts;
8195 endinpos = startinpos+1;
8196 if (unicode_decode_call_errorhandler_writer(
8197 errors, &errorHandler,
8198 "charmap", "character maps to <undefined>",
8199 &starts, &e, &startinpos, &endinpos, &exc, &s,
8200 writer)) {
8201 goto onError;
8202 }
8203 }
8204 Py_XDECREF(errorHandler);
8205 Py_XDECREF(exc);
8206 return 0;
8207
8208onError:
8209 Py_XDECREF(item);
8210 Py_XDECREF(errorHandler);
8211 Py_XDECREF(exc);
8212 return -1;
8213}
8214
Alexander Belopolsky40018472011-02-26 01:02:56 +00008215PyObject *
8216PyUnicode_DecodeCharmap(const char *s,
8217 Py_ssize_t size,
8218 PyObject *mapping,
8219 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008221 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008222
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 /* Default to Latin-1 */
8224 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008228 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008229 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008230 writer.min_length = size;
8231 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008233
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008234 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008235 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8236 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008237 }
8238 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008239 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8240 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008242 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008243
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008245 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 return NULL;
8247}
8248
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008249/* Charmap encoding: the lookup table */
8250
Alexander Belopolsky40018472011-02-26 01:02:56 +00008251struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 PyObject_HEAD
8253 unsigned char level1[32];
8254 int count2, count3;
8255 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008256};
8257
8258static PyObject*
8259encoding_map_size(PyObject *obj, PyObject* args)
8260{
8261 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008262 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264}
8265
8266static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008267 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 PyDoc_STR("Return the size (in bytes) of this object") },
8269 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008270};
8271
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008273 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 "EncodingMap", /*tp_name*/
8275 sizeof(struct encoding_map), /*tp_basicsize*/
8276 0, /*tp_itemsize*/
8277 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008278 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008279 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 0, /*tp_getattr*/
8281 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008282 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 0, /*tp_repr*/
8284 0, /*tp_as_number*/
8285 0, /*tp_as_sequence*/
8286 0, /*tp_as_mapping*/
8287 0, /*tp_hash*/
8288 0, /*tp_call*/
8289 0, /*tp_str*/
8290 0, /*tp_getattro*/
8291 0, /*tp_setattro*/
8292 0, /*tp_as_buffer*/
8293 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8294 0, /*tp_doc*/
8295 0, /*tp_traverse*/
8296 0, /*tp_clear*/
8297 0, /*tp_richcompare*/
8298 0, /*tp_weaklistoffset*/
8299 0, /*tp_iter*/
8300 0, /*tp_iternext*/
8301 encoding_map_methods, /*tp_methods*/
8302 0, /*tp_members*/
8303 0, /*tp_getset*/
8304 0, /*tp_base*/
8305 0, /*tp_dict*/
8306 0, /*tp_descr_get*/
8307 0, /*tp_descr_set*/
8308 0, /*tp_dictoffset*/
8309 0, /*tp_init*/
8310 0, /*tp_alloc*/
8311 0, /*tp_new*/
8312 0, /*tp_free*/
8313 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008314};
8315
8316PyObject*
8317PyUnicode_BuildEncodingMap(PyObject* string)
8318{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008319 PyObject *result;
8320 struct encoding_map *mresult;
8321 int i;
8322 int need_dict = 0;
8323 unsigned char level1[32];
8324 unsigned char level2[512];
8325 unsigned char *mlevel1, *mlevel2, *mlevel3;
8326 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008328 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008329 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008331
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008332 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 PyErr_BadArgument();
8334 return NULL;
8335 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008336 kind = PyUnicode_KIND(string);
8337 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008338 length = PyUnicode_GET_LENGTH(string);
8339 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340 memset(level1, 0xFF, sizeof level1);
8341 memset(level2, 0xFF, sizeof level2);
8342
8343 /* If there isn't a one-to-one mapping of NULL to \0,
8344 or if there are non-BMP characters, we need to use
8345 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008348 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008349 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 ch = PyUnicode_READ(kind, data, i);
8351 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008352 need_dict = 1;
8353 break;
8354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 /* unmapped character */
8357 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008358 l1 = ch >> 11;
8359 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008360 if (level1[l1] == 0xFF)
8361 level1[l1] = count2++;
8362 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008363 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008364 }
8365
8366 if (count2 >= 0xFF || count3 >= 0xFF)
8367 need_dict = 1;
8368
8369 if (need_dict) {
8370 PyObject *result = PyDict_New();
8371 PyObject *key, *value;
8372 if (!result)
8373 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008374 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008376 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008377 if (!key || !value)
8378 goto failed1;
8379 if (PyDict_SetItem(result, key, value) == -1)
8380 goto failed1;
8381 Py_DECREF(key);
8382 Py_DECREF(value);
8383 }
8384 return result;
8385 failed1:
8386 Py_XDECREF(key);
8387 Py_XDECREF(value);
8388 Py_DECREF(result);
8389 return NULL;
8390 }
8391
8392 /* Create a three-level trie */
8393 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8394 16*count2 + 128*count3 - 1);
8395 if (!result)
8396 return PyErr_NoMemory();
8397 PyObject_Init(result, &EncodingMapType);
8398 mresult = (struct encoding_map*)result;
8399 mresult->count2 = count2;
8400 mresult->count3 = count3;
8401 mlevel1 = mresult->level1;
8402 mlevel2 = mresult->level23;
8403 mlevel3 = mresult->level23 + 16*count2;
8404 memcpy(mlevel1, level1, 32);
8405 memset(mlevel2, 0xFF, 16*count2);
8406 memset(mlevel3, 0, 128*count3);
8407 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008408 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008409 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008410 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8411 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008412 /* unmapped character */
8413 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008414 o1 = ch>>11;
8415 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008416 i2 = 16*mlevel1[o1] + o2;
8417 if (mlevel2[i2] == 0xFF)
8418 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008419 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008420 i3 = 128*mlevel2[i2] + o3;
8421 mlevel3[i3] = i;
8422 }
8423 return result;
8424}
8425
8426static int
Victor Stinner22168992011-11-20 17:09:18 +01008427encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008428{
8429 struct encoding_map *map = (struct encoding_map*)mapping;
8430 int l1 = c>>11;
8431 int l2 = (c>>7) & 0xF;
8432 int l3 = c & 0x7F;
8433 int i;
8434
Victor Stinner22168992011-11-20 17:09:18 +01008435 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008437 if (c == 0)
8438 return 0;
8439 /* level 1*/
8440 i = map->level1[l1];
8441 if (i == 0xFF) {
8442 return -1;
8443 }
8444 /* level 2*/
8445 i = map->level23[16*i+l2];
8446 if (i == 0xFF) {
8447 return -1;
8448 }
8449 /* level 3 */
8450 i = map->level23[16*map->count2 + 128*i + l3];
8451 if (i == 0) {
8452 return -1;
8453 }
8454 return i;
8455}
8456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457/* Lookup the character ch in the mapping. If the character
8458 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008459 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008460static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008461charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462{
Christian Heimes217cfd12007-12-02 14:31:20 +00008463 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464 PyObject *x;
8465
8466 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008468 x = PyObject_GetItem(mapping, w);
8469 Py_DECREF(w);
8470 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8472 /* No mapping found means: mapping is undefined. */
8473 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008474 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 } else
8476 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008478 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008480 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 long value = PyLong_AS_LONG(x);
8482 if (value < 0 || value > 255) {
8483 PyErr_SetString(PyExc_TypeError,
8484 "character mapping must be in range(256)");
8485 Py_DECREF(x);
8486 return NULL;
8487 }
8488 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008490 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 /* wrong return value */
8494 PyErr_Format(PyExc_TypeError,
8495 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008496 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 Py_DECREF(x);
8498 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 }
8500}
8501
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008502static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008503charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008504{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008505 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8506 /* exponentially overallocate to minimize reallocations */
8507 if (requiredsize < 2*outsize)
8508 requiredsize = 2*outsize;
8509 if (_PyBytes_Resize(outobj, requiredsize))
8510 return -1;
8511 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008512}
8513
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was written to the output */
    enc_FAILED = 1,     /* the mapping has no encoding for the character */
    enc_EXCEPTION = 2   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008518 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 space is available. Return a new reference to the object that
8520 was put in the output buffer, or Py_None, if the mapping was undefined
8521 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008522 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008523static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008524charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008525 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008527 PyObject *rep;
8528 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008529 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530
Andy Lesterdffe4c02020-03-04 07:15:20 -06008531 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008532 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008534 if (res == -1)
8535 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 if (outsize<requiredsize)
8537 if (charmapencode_resize(outobj, outpos, requiredsize))
8538 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008539 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 outstart[(*outpos)++] = (char)res;
8541 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008542 }
8543
8544 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008547 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 Py_DECREF(rep);
8549 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008550 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 if (PyLong_Check(rep)) {
8552 Py_ssize_t requiredsize = *outpos+1;
8553 if (outsize<requiredsize)
8554 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8555 Py_DECREF(rep);
8556 return enc_EXCEPTION;
8557 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008558 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008560 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 else {
8562 const char *repchars = PyBytes_AS_STRING(rep);
8563 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8564 Py_ssize_t requiredsize = *outpos+repsize;
8565 if (outsize<requiredsize)
8566 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8567 Py_DECREF(rep);
8568 return enc_EXCEPTION;
8569 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008570 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 memcpy(outstart + *outpos, repchars, repsize);
8572 *outpos += repsize;
8573 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008575 Py_DECREF(rep);
8576 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577}
8578
8579/* handle an error in PyUnicode_EncodeCharmap
8580 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008581static int
8582charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008583 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008584 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008585 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008586 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587{
8588 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008589 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008590 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008591 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008592 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008593 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008595 Py_ssize_t collstartpos = *inpos;
8596 Py_ssize_t collendpos = *inpos+1;
8597 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008598 const char *encoding = "charmap";
8599 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008600 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008601 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008602 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603
Benjamin Petersonbac79492012-01-14 13:34:47 -05008604 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008605 return -1;
8606 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008607 /* find all unencodable characters */
8608 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008609 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008610 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008611 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008612 val = encoding_map_lookup(ch, mapping);
8613 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 break;
8615 ++collendpos;
8616 continue;
8617 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008618
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008619 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8620 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 if (rep==NULL)
8622 return -1;
8623 else if (rep!=Py_None) {
8624 Py_DECREF(rep);
8625 break;
8626 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008627 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 }
8630 /* cache callback name lookup
8631 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008632 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008633 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008634
8635 switch (*error_handler) {
8636 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008637 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008638 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008639
8640 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008641 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 x = charmapencode_output('?', mapping, res, respos);
8643 if (x==enc_EXCEPTION) {
8644 return -1;
8645 }
8646 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008647 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 return -1;
8649 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008650 }
8651 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008652 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008653 *inpos = collendpos;
8654 break;
Victor Stinner50149202015-09-22 00:26:54 +02008655
8656 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008657 /* generate replacement (temporarily (mis)uses p) */
8658 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 char buffer[2+29+1+1];
8660 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008661 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 for (cp = buffer; *cp; ++cp) {
8663 x = charmapencode_output(*cp, mapping, res, respos);
8664 if (x==enc_EXCEPTION)
8665 return -1;
8666 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008667 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 return -1;
8669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008670 }
8671 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008672 *inpos = collendpos;
8673 break;
Victor Stinner50149202015-09-22 00:26:54 +02008674
Benjamin Peterson14339b62009-01-31 16:36:08 +00008675 default:
Victor Stinner50149202015-09-22 00:26:54 +02008676 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008677 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008679 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008681 if (PyBytes_Check(repunicode)) {
8682 /* Directly copy bytes result to output. */
8683 Py_ssize_t outsize = PyBytes_Size(*res);
8684 Py_ssize_t requiredsize;
8685 repsize = PyBytes_Size(repunicode);
8686 requiredsize = *respos + repsize;
8687 if (requiredsize > outsize)
8688 /* Make room for all additional bytes. */
8689 if (charmapencode_resize(res, respos, requiredsize)) {
8690 Py_DECREF(repunicode);
8691 return -1;
8692 }
8693 memcpy(PyBytes_AsString(*res) + *respos,
8694 PyBytes_AsString(repunicode), repsize);
8695 *respos += repsize;
8696 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008697 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008698 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008699 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008700 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008701 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008702 Py_DECREF(repunicode);
8703 return -1;
8704 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008705 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008706 data = PyUnicode_DATA(repunicode);
8707 kind = PyUnicode_KIND(repunicode);
8708 for (index = 0; index < repsize; index++) {
8709 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8710 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008712 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 return -1;
8714 }
8715 else if (x==enc_FAILED) {
8716 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008717 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return -1;
8719 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008720 }
8721 *inpos = newpos;
8722 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 }
8724 return 0;
8725}
8726
Alexander Belopolsky40018472011-02-26 01:02:56 +00008727PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008728_PyUnicode_EncodeCharmap(PyObject *unicode,
8729 PyObject *mapping,
8730 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 /* output object */
8733 PyObject *res = NULL;
8734 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008735 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008736 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008738 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008739 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008740 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008741 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008742 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008743 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744
Benjamin Petersonbac79492012-01-14 13:34:47 -05008745 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008746 return NULL;
8747 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008748 data = PyUnicode_DATA(unicode);
8749 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008750
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751 /* Default to Latin-1 */
8752 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008753 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755 /* allocate enough for a simple encoding without
8756 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008757 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758 if (res == NULL)
8759 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008760 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008764 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008766 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 if (x==enc_EXCEPTION) /* error */
8768 goto onError;
8769 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008770 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008772 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 &res, &respos)) {
8774 goto onError;
8775 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008776 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 else
8778 /* done with this character => adjust input position */
8779 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008782 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008783 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008784 if (_PyBytes_Resize(&res, respos) < 0)
8785 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008786
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008787 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008788 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789 return res;
8790
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792 Py_XDECREF(res);
8793 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008794 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795 return NULL;
8796}
8797
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008798/* Deprecated */
8799PyObject *
8800PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8801 Py_ssize_t size,
8802 PyObject *mapping,
8803 const char *errors)
8804{
8805 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008806 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008807 if (unicode == NULL)
8808 return NULL;
8809 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8810 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008811 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008812}
8813
Alexander Belopolsky40018472011-02-26 01:02:56 +00008814PyObject *
8815PyUnicode_AsCharmapString(PyObject *unicode,
8816 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817{
8818 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008819 PyErr_BadArgument();
8820 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008822 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823}
8824
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008826static void
8827make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008829 Py_ssize_t startpos, Py_ssize_t endpos,
8830 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008832 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 *exceptionObject = _PyUnicodeTranslateError_Create(
8834 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835 }
8836 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8838 goto onError;
8839 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8840 goto onError;
8841 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8842 goto onError;
8843 return;
8844 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008845 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 }
8847}
8848
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008849/* error handling callback helper:
8850 build arguments, call the callback and check the arguments,
8851 put the result into newpos and return the replacement string, which
8852 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008853static PyObject *
8854unicode_translate_call_errorhandler(const char *errors,
8855 PyObject **errorHandler,
8856 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008858 Py_ssize_t startpos, Py_ssize_t endpos,
8859 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008860{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008861 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008862
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008863 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008864 PyObject *restuple;
8865 PyObject *resunicode;
8866
8867 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008868 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008869 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008871 }
8872
8873 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008875 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008877
Petr Viktorinffd97532020-02-11 17:46:57 +01008878 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008879 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008881 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008882 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 Py_DECREF(restuple);
8884 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008885 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008886 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 &resunicode, &i_newpos)) {
8888 Py_DECREF(restuple);
8889 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008890 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008891 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008893 else
8894 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008896 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 Py_DECREF(restuple);
8898 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008899 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008900 Py_INCREF(resunicode);
8901 Py_DECREF(restuple);
8902 return resunicode;
8903}
8904
8905/* Lookup the character ch in the mapping and put the result in result,
8906 which must be decrefed by the caller.
8907 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008908static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910{
Christian Heimes217cfd12007-12-02 14:31:20 +00008911 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008912 PyObject *x;
8913
8914 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008916 x = PyObject_GetItem(mapping, w);
8917 Py_DECREF(w);
8918 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8920 /* No mapping found means: use 1:1 mapping. */
8921 PyErr_Clear();
8922 *result = NULL;
8923 return 0;
8924 } else
8925 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008926 }
8927 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008928 *result = x;
8929 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008930 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008931 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008933 if (value < 0 || value > MAX_UNICODE) {
8934 PyErr_Format(PyExc_ValueError,
8935 "character mapping must be in range(0x%x)",
8936 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 Py_DECREF(x);
8938 return -1;
8939 }
8940 *result = x;
8941 return 0;
8942 }
8943 else if (PyUnicode_Check(x)) {
8944 *result = x;
8945 return 0;
8946 }
8947 else {
8948 /* wrong return value */
8949 PyErr_SetString(PyExc_TypeError,
8950 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008951 Py_DECREF(x);
8952 return -1;
8953 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008954}
Victor Stinner1194ea02014-04-04 19:37:40 +02008955
8956/* lookup the character, write the result into the writer.
8957 Return 1 if the result was written into the writer, return 0 if the mapping
8958 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008959static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008960charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8961 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008962{
Victor Stinner1194ea02014-04-04 19:37:40 +02008963 PyObject *item;
8964
8965 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008966 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008967
8968 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008970 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008973 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008974 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008975
8976 if (item == Py_None) {
8977 Py_DECREF(item);
8978 return 0;
8979 }
8980
8981 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008982 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8983 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8984 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008985 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8986 Py_DECREF(item);
8987 return -1;
8988 }
8989 Py_DECREF(item);
8990 return 1;
8991 }
8992
8993 if (!PyUnicode_Check(item)) {
8994 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008996 }
8997
8998 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8999 Py_DECREF(item);
9000 return -1;
9001 }
9002
9003 Py_DECREF(item);
9004 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009005}
9006
Victor Stinner89a76ab2014-04-05 11:44:04 +02009007static int
9008unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9009 Py_UCS1 *translate)
9010{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009011 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009012 int ret = 0;
9013
Victor Stinner89a76ab2014-04-05 11:44:04 +02009014 if (charmaptranslate_lookup(ch, mapping, &item)) {
9015 return -1;
9016 }
9017
9018 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009019 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009020 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009021 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009022 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009023 /* not found => default to 1:1 mapping */
9024 translate[ch] = ch;
9025 return 1;
9026 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009027 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009028 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009029 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9030 used it */
9031 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009032 /* invalid character or character outside ASCII:
9033 skip the fast translate */
9034 goto exit;
9035 }
9036 translate[ch] = (Py_UCS1)replace;
9037 }
9038 else if (PyUnicode_Check(item)) {
9039 Py_UCS4 replace;
9040
9041 if (PyUnicode_READY(item) == -1) {
9042 Py_DECREF(item);
9043 return -1;
9044 }
9045 if (PyUnicode_GET_LENGTH(item) != 1)
9046 goto exit;
9047
9048 replace = PyUnicode_READ_CHAR(item, 0);
9049 if (replace > 127)
9050 goto exit;
9051 translate[ch] = (Py_UCS1)replace;
9052 }
9053 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009054 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009055 goto exit;
9056 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009057 ret = 1;
9058
Benjamin Peterson1365de72014-04-07 20:15:41 -04009059 exit:
9060 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009061 return ret;
9062}
9063
9064/* Fast path for ascii => ascii translation. Return 1 if the whole string
9065 was translated into writer, return 0 if the input string was partially
9066 translated into writer, raise an exception and return -1 on error. */
9067static int
9068unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009069 _PyUnicodeWriter *writer, int ignore,
9070 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009071{
Victor Stinner872b2912014-04-05 14:27:07 +02009072 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009073 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009074 const Py_UCS1 *in, *end;
9075 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009076 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009077
Victor Stinner89a76ab2014-04-05 11:44:04 +02009078 len = PyUnicode_GET_LENGTH(input);
9079
Victor Stinner872b2912014-04-05 14:27:07 +02009080 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009081
9082 in = PyUnicode_1BYTE_DATA(input);
9083 end = in + len;
9084
9085 assert(PyUnicode_IS_ASCII(writer->buffer));
9086 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9087 out = PyUnicode_1BYTE_DATA(writer->buffer);
9088
Victor Stinner872b2912014-04-05 14:27:07 +02009089 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009090 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009091 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009092 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009093 int translate = unicode_fast_translate_lookup(mapping, ch,
9094 ascii_table);
9095 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009096 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009097 if (translate == 0)
9098 goto exit;
9099 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009100 }
Victor Stinner872b2912014-04-05 14:27:07 +02009101 if (ch2 == 0xfe) {
9102 if (ignore)
9103 continue;
9104 goto exit;
9105 }
9106 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009107 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009108 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009109 }
Victor Stinner872b2912014-04-05 14:27:07 +02009110 res = 1;
9111
9112exit:
9113 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009114 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009115 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009116}
9117
Victor Stinner3222da22015-10-01 22:07:32 +02009118static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119_PyUnicode_TranslateCharmap(PyObject *input,
9120 PyObject *mapping,
9121 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009124 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 Py_ssize_t size, i;
9126 int kind;
9127 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009128 _PyUnicodeWriter writer;
9129 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009130 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009131 PyObject *errorHandler = NULL;
9132 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009133 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009134 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009135
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 PyErr_BadArgument();
9138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 if (PyUnicode_READY(input) == -1)
9142 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009143 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 kind = PyUnicode_KIND(input);
9145 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009147 if (size == 0)
9148 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009150 /* allocate enough for a simple 1:1 translation without
9151 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009152 _PyUnicodeWriter_Init(&writer);
9153 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155
Victor Stinner872b2912014-04-05 14:27:07 +02009156 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9157
Victor Stinner33798672016-03-01 21:59:58 +01009158 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009159 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009160 if (PyUnicode_IS_ASCII(input)) {
9161 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9162 if (res < 0) {
9163 _PyUnicodeWriter_Dealloc(&writer);
9164 return NULL;
9165 }
9166 if (res == 1)
9167 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009168 }
Victor Stinner33798672016-03-01 21:59:58 +01009169 else {
9170 i = 0;
9171 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009175 int translate;
9176 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9177 Py_ssize_t newpos;
9178 /* startpos for collecting untranslatable chars */
9179 Py_ssize_t collstart;
9180 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009181 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182
Victor Stinner1194ea02014-04-04 19:37:40 +02009183 ch = PyUnicode_READ(kind, data, i);
9184 translate = charmaptranslate_output(ch, mapping, &writer);
9185 if (translate < 0)
9186 goto onError;
9187
9188 if (translate != 0) {
9189 /* it worked => adjust input pointer */
9190 ++i;
9191 continue;
9192 }
9193
9194 /* untranslatable character */
9195 collstart = i;
9196 collend = i+1;
9197
9198 /* find all untranslatable characters */
9199 while (collend < size) {
9200 PyObject *x;
9201 ch = PyUnicode_READ(kind, data, collend);
9202 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009203 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009204 Py_XDECREF(x);
9205 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009207 ++collend;
9208 }
9209
9210 if (ignore) {
9211 i = collend;
9212 }
9213 else {
9214 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9215 reason, input, &exc,
9216 collstart, collend, &newpos);
9217 if (repunicode == NULL)
9218 goto onError;
9219 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009221 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009222 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009223 Py_DECREF(repunicode);
9224 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009225 }
9226 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009227 Py_XDECREF(exc);
9228 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009229 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230
Benjamin Peterson29060642009-01-31 22:14:21 +00009231 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009232 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009233 Py_XDECREF(exc);
9234 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 return NULL;
9236}
9237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238/* Deprecated. Use PyUnicode_Translate instead. */
9239PyObject *
9240PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9241 Py_ssize_t size,
9242 PyObject *mapping,
9243 const char *errors)
9244{
Christian Heimes5f520f42012-09-11 14:03:25 +02009245 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009246 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 if (!unicode)
9248 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009249 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9250 Py_DECREF(unicode);
9251 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252}
9253
Alexander Belopolsky40018472011-02-26 01:02:56 +00009254PyObject *
9255PyUnicode_Translate(PyObject *str,
9256 PyObject *mapping,
9257 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009259 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009260 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009261 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262}
Tim Petersced69f82003-09-16 20:30:58 +00009263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264PyObject *
9265_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9266{
9267 if (!PyUnicode_Check(unicode)) {
9268 PyErr_BadInternalCall();
9269 return NULL;
9270 }
9271 if (PyUnicode_READY(unicode) == -1)
9272 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009273 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 /* If the string is already ASCII, just return the same string */
9275 Py_INCREF(unicode);
9276 return unicode;
9277 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009278
9279 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9280 PyObject *result = PyUnicode_New(len, 127);
9281 if (result == NULL) {
9282 return NULL;
9283 }
9284
9285 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9286 int kind = PyUnicode_KIND(unicode);
9287 const void *data = PyUnicode_DATA(unicode);
9288 Py_ssize_t i;
9289 for (i = 0; i < len; ++i) {
9290 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9291 if (ch < 127) {
9292 out[i] = ch;
9293 }
9294 else if (Py_UNICODE_ISSPACE(ch)) {
9295 out[i] = ' ';
9296 }
9297 else {
9298 int decimal = Py_UNICODE_TODECIMAL(ch);
9299 if (decimal < 0) {
9300 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009301 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009302 _PyUnicode_LENGTH(result) = i + 1;
9303 break;
9304 }
9305 out[i] = '0' + decimal;
9306 }
9307 }
9308
INADA Naoki16dfca42018-07-14 12:06:43 +09009309 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009310 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311}
9312
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009313PyObject *
9314PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9315 Py_ssize_t length)
9316{
Victor Stinnerf0124502011-11-21 23:12:56 +01009317 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009318 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009319 Py_UCS4 maxchar;
9320 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009321 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009322
Victor Stinner99d7ad02012-02-22 13:37:39 +01009323 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009324 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009325 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009326 if (ch > 127) {
9327 int decimal = Py_UNICODE_TODECIMAL(ch);
9328 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009329 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009330 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009331 }
9332 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009333
9334 /* Copy to a new string */
9335 decimal = PyUnicode_New(length, maxchar);
9336 if (decimal == NULL)
9337 return decimal;
9338 kind = PyUnicode_KIND(decimal);
9339 data = PyUnicode_DATA(decimal);
9340 /* Iterate over code points */
9341 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009342 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009343 if (ch > 127) {
9344 int decimal = Py_UNICODE_TODECIMAL(ch);
9345 if (decimal >= 0)
9346 ch = '0' + decimal;
9347 }
9348 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009350 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009351}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009352/* --- Decimal Encoder ---------------------------------------------------- */
9353
Alexander Belopolsky40018472011-02-26 01:02:56 +00009354int
9355PyUnicode_EncodeDecimal(Py_UNICODE *s,
9356 Py_ssize_t length,
9357 char *output,
9358 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009359{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009360 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009361 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009362 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009363 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009364
9365 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009366 PyErr_BadArgument();
9367 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009368 }
9369
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009370 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009371 if (unicode == NULL)
9372 return -1;
9373
Victor Stinner42bf7752011-11-21 22:52:58 +01009374 kind = PyUnicode_KIND(unicode);
9375 data = PyUnicode_DATA(unicode);
9376
Victor Stinnerb84d7232011-11-22 01:50:07 +01009377 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009378 PyObject *exc;
9379 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009381 Py_ssize_t startpos;
9382
9383 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009384
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009386 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009387 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009389 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009390 decimal = Py_UNICODE_TODECIMAL(ch);
9391 if (decimal >= 0) {
9392 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009393 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 continue;
9395 }
9396 if (0 < ch && ch < 256) {
9397 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009398 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 continue;
9400 }
Victor Stinner6345be92011-11-25 20:09:01 +01009401
Victor Stinner42bf7752011-11-21 22:52:58 +01009402 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009403 exc = NULL;
9404 raise_encode_exception(&exc, "decimal", unicode,
9405 startpos, startpos+1,
9406 "invalid decimal Unicode string");
9407 Py_XDECREF(exc);
9408 Py_DECREF(unicode);
9409 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009410 }
9411 /* 0-terminate the output string */
9412 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009413 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009414 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009415}
9416
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417/* --- Helpers ------------------------------------------------------------ */
9418
/* Helper macro to fix up start/end slice values in place for a sequence of
   length len:
     - end is clamped to len; a negative end counts from the back and is
       clamped to 0,
     - a negative start counts from the back and is clamped to 0.
   start and end must be modifiable lvalues; len may be evaluated more than
   once.  The body is wrapped in do { } while (0) so that the macro behaves
   like a single statement (safe inside an unbraced if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009435any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009437 Py_ssize_t end,
9438 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009440 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009441 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 Py_ssize_t len1, len2, result;
9443
9444 kind1 = PyUnicode_KIND(s1);
9445 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009446 if (kind1 < kind2)
9447 return -1;
9448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 len1 = PyUnicode_GET_LENGTH(s1);
9450 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009451 ADJUST_INDICES(start, end, len1);
9452 if (end - start < len2)
9453 return -1;
9454
9455 buf1 = PyUnicode_DATA(s1);
9456 buf2 = PyUnicode_DATA(s2);
9457 if (len2 == 1) {
9458 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9459 result = findchar((const char *)buf1 + kind1*start,
9460 kind1, end - start, ch, direction);
9461 if (result == -1)
9462 return -1;
9463 else
9464 return start + result;
9465 }
9466
9467 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009468 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009469 if (!buf2)
9470 return -2;
9471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472
Victor Stinner794d5672011-10-10 03:21:36 +02009473 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009474 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009475 case PyUnicode_1BYTE_KIND:
9476 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9477 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9478 else
9479 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9480 break;
9481 case PyUnicode_2BYTE_KIND:
9482 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9483 break;
9484 case PyUnicode_4BYTE_KIND:
9485 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9486 break;
9487 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009488 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009489 }
9490 }
9491 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009492 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009493 case PyUnicode_1BYTE_KIND:
9494 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9495 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9496 else
9497 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9498 break;
9499 case PyUnicode_2BYTE_KIND:
9500 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9501 break;
9502 case PyUnicode_4BYTE_KIND:
9503 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9504 break;
9505 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009506 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 }
9509
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009510 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009511 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009512 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513
9514 return result;
9515}
9516
Victor Stinner59423e32018-11-26 13:40:01 +01009517/* _PyUnicode_InsertThousandsGrouping() helper functions */
9518#include "stringlib/localeutil.h"
9519
9520/**
9521 * InsertThousandsGrouping:
9522 * @writer: Unicode writer.
9523 * @n_buffer: Number of characters in @buffer.
9524 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9525 * @d_pos: Start of digits string.
9526 * @n_digits: The number of digits in the string, in which we want
9527 * to put the grouping chars.
9528 * @min_width: The minimum width of the digits in the output string.
9529 * Output will be zero-padded on the left to fill.
9530 * @grouping: see definition in localeconv().
9531 * @thousands_sep: see definition in localeconv().
9532 *
9533 * There are 2 modes: counting and filling. If @writer is NULL,
9534 * we are in counting mode, else filling mode.
9535 * If counting, the required buffer size is returned.
9536 * If filling, we know the buffer will be large enough, so we don't
9537 * need to pass in the buffer size.
9538 * Inserts thousand grouping characters (as defined by grouping and
9539 * thousands_sep) into @writer.
9540 *
9541 * Return value: -1 on error, number of characters otherwise.
9542 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009544_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009545 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009546 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009547 PyObject *digits,
9548 Py_ssize_t d_pos,
9549 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009550 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009551 const char *grouping,
9552 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009553 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554{
Xtreak3f7983a2019-01-07 20:39:14 +05309555 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009556 if (writer) {
9557 assert(digits != NULL);
9558 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009559 }
9560 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009561 assert(digits == NULL);
9562 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009563 }
Victor Stinner59423e32018-11-26 13:40:01 +01009564 assert(0 <= d_pos);
9565 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009566 assert(grouping != NULL);
9567
9568 if (digits != NULL) {
9569 if (PyUnicode_READY(digits) == -1) {
9570 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009571 }
Victor Stinner59423e32018-11-26 13:40:01 +01009572 }
9573 if (PyUnicode_READY(thousands_sep) == -1) {
9574 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009575 }
9576
Victor Stinner59423e32018-11-26 13:40:01 +01009577 Py_ssize_t count = 0;
9578 Py_ssize_t n_zeros;
9579 int loop_broken = 0;
9580 int use_separator = 0; /* First time through, don't append the
9581 separator. They only go between
9582 groups. */
9583 Py_ssize_t buffer_pos;
9584 Py_ssize_t digits_pos;
9585 Py_ssize_t len;
9586 Py_ssize_t n_chars;
9587 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9588 be looked at */
9589 /* A generator that returns all of the grouping widths, until it
9590 returns 0. */
9591 GroupGenerator groupgen;
9592 GroupGenerator_init(&groupgen, grouping);
9593 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9594
9595 /* if digits are not grouped, thousands separator
9596 should be an empty string */
9597 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9598
9599 digits_pos = d_pos + n_digits;
9600 if (writer) {
9601 buffer_pos = writer->pos + n_buffer;
9602 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9603 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 }
Victor Stinner59423e32018-11-26 13:40:01 +01009605 else {
9606 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009607 }
Victor Stinner59423e32018-11-26 13:40:01 +01009608
9609 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009610 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009611 }
Victor Stinner59423e32018-11-26 13:40:01 +01009612
9613 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9614 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9615 n_zeros = Py_MAX(0, len - remaining);
9616 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9617
9618 /* Use n_zero zero's and n_chars chars */
9619
9620 /* Count only, don't do anything. */
9621 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9622
9623 /* Copy into the writer. */
9624 InsertThousandsGrouping_fill(writer, &buffer_pos,
9625 digits, &digits_pos,
9626 n_chars, n_zeros,
9627 use_separator ? thousands_sep : NULL,
9628 thousands_sep_len, maxchar);
9629
9630 /* Use a separator next time. */
9631 use_separator = 1;
9632
9633 remaining -= n_chars;
9634 min_width -= len;
9635
9636 if (remaining <= 0 && min_width <= 0) {
9637 loop_broken = 1;
9638 break;
9639 }
9640 min_width -= thousands_sep_len;
9641 }
9642 if (!loop_broken) {
9643 /* We left the loop without using a break statement. */
9644
9645 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9646 n_zeros = Py_MAX(0, len - remaining);
9647 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9648
9649 /* Use n_zero zero's and n_chars chars */
9650 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9651
9652 /* Copy into the writer. */
9653 InsertThousandsGrouping_fill(writer, &buffer_pos,
9654 digits, &digits_pos,
9655 n_chars, n_zeros,
9656 use_separator ? thousands_sep : NULL,
9657 thousands_sep_len, maxchar);
9658 }
9659 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660}
9661
9662
Alexander Belopolsky40018472011-02-26 01:02:56 +00009663Py_ssize_t
9664PyUnicode_Count(PyObject *str,
9665 PyObject *substr,
9666 Py_ssize_t start,
9667 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009669 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009670 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009671 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009673
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009674 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009675 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009676
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009677 kind1 = PyUnicode_KIND(str);
9678 kind2 = PyUnicode_KIND(substr);
9679 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009680 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009681
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009682 len1 = PyUnicode_GET_LENGTH(str);
9683 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009685 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009686 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009687
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009688 buf1 = PyUnicode_DATA(str);
9689 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009690 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009691 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009692 if (!buf2)
9693 goto onError;
9694 }
9695
9696 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009698 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009699 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009700 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009701 buf2, len2, PY_SSIZE_T_MAX
9702 );
9703 else
9704 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009705 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009706 buf2, len2, PY_SSIZE_T_MAX
9707 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 break;
9709 case PyUnicode_2BYTE_KIND:
9710 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009711 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 buf2, len2, PY_SSIZE_T_MAX
9713 );
9714 break;
9715 case PyUnicode_4BYTE_KIND:
9716 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009717 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 buf2, len2, PY_SSIZE_T_MAX
9719 );
9720 break;
9721 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009722 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009724
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009725 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009726 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009727 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009731 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9732 if (kind2 != kind1)
9733 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735}
9736
Alexander Belopolsky40018472011-02-26 01:02:56 +00009737Py_ssize_t
9738PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009739 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009740 Py_ssize_t start,
9741 Py_ssize_t end,
9742 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009744 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009745 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009746
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009747 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748}
9749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750Py_ssize_t
9751PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9752 Py_ssize_t start, Py_ssize_t end,
9753 int direction)
9754{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009756 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 if (PyUnicode_READY(str) == -1)
9758 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009759 len = PyUnicode_GET_LENGTH(str);
9760 ADJUST_INDICES(start, end, len);
9761 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009762 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009764 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9765 kind, end-start, ch, direction);
9766 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009768 else
9769 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770}
9771
Alexander Belopolsky40018472011-02-26 01:02:56 +00009772static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009773tailmatch(PyObject *self,
9774 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009775 Py_ssize_t start,
9776 Py_ssize_t end,
9777 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 int kind_self;
9780 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009781 const void *data_self;
9782 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 Py_ssize_t offset;
9784 Py_ssize_t i;
9785 Py_ssize_t end_sub;
9786
9787 if (PyUnicode_READY(self) == -1 ||
9788 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009789 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9792 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009794 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009796 if (PyUnicode_GET_LENGTH(substring) == 0)
9797 return 1;
9798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 kind_self = PyUnicode_KIND(self);
9800 data_self = PyUnicode_DATA(self);
9801 kind_sub = PyUnicode_KIND(substring);
9802 data_sub = PyUnicode_DATA(substring);
9803 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9804
9805 if (direction > 0)
9806 offset = end;
9807 else
9808 offset = start;
9809
9810 if (PyUnicode_READ(kind_self, data_self, offset) ==
9811 PyUnicode_READ(kind_sub, data_sub, 0) &&
9812 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9813 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9814 /* If both are of the same kind, memcmp is sufficient */
9815 if (kind_self == kind_sub) {
9816 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009817 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 data_sub,
9819 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009820 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009822 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 else {
9824 /* We do not need to compare 0 and len(substring)-1 because
9825 the if statement above ensured already that they are equal
9826 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 for (i = 1; i < end_sub; ++i) {
9828 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9829 PyUnicode_READ(kind_sub, data_sub, i))
9830 return 0;
9831 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009832 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834 }
9835
9836 return 0;
9837}
9838
Alexander Belopolsky40018472011-02-26 01:02:56 +00009839Py_ssize_t
9840PyUnicode_Tailmatch(PyObject *str,
9841 PyObject *substr,
9842 Py_ssize_t start,
9843 Py_ssize_t end,
9844 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009846 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009847 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009848
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009849 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850}
9851
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009852static PyObject *
9853ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009855 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009856 const char *data = PyUnicode_DATA(self);
9857 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009858 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009859
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009860 res = PyUnicode_New(len, 127);
9861 if (res == NULL)
9862 return NULL;
9863 resdata = PyUnicode_DATA(res);
9864 if (lower)
9865 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009867 _Py_bytes_upper(resdata, data, len);
9868 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869}
9870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009872handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009874 Py_ssize_t j;
9875 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009876 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009877 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009878
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009879 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9880
9881 where ! is a negation and \p{xxx} is a character with property xxx.
9882 */
9883 for (j = i - 1; j >= 0; j--) {
9884 c = PyUnicode_READ(kind, data, j);
9885 if (!_PyUnicode_IsCaseIgnorable(c))
9886 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009888 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9889 if (final_sigma) {
9890 for (j = i + 1; j < length; j++) {
9891 c = PyUnicode_READ(kind, data, j);
9892 if (!_PyUnicode_IsCaseIgnorable(c))
9893 break;
9894 }
9895 final_sigma = j == length || !_PyUnicode_IsCased(c);
9896 }
9897 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898}
9899
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009900static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009901lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009902 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009904 /* Obscure special case. */
9905 if (c == 0x3A3) {
9906 mapped[0] = handle_capital_sigma(kind, data, length, i);
9907 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009909 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910}
9911
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009912static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009913do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009915 Py_ssize_t i, k = 0;
9916 int n_res, j;
9917 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009918
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009919 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009920 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009921 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009922 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009923 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009925 for (i = 1; i < length; i++) {
9926 c = PyUnicode_READ(kind, data, i);
9927 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9928 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009929 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009930 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009931 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009932 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009933 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934}
9935
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009936static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009937do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009938 Py_ssize_t i, k = 0;
9939
9940 for (i = 0; i < length; i++) {
9941 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9942 int n_res, j;
9943 if (Py_UNICODE_ISUPPER(c)) {
9944 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9945 }
9946 else if (Py_UNICODE_ISLOWER(c)) {
9947 n_res = _PyUnicode_ToUpperFull(c, mapped);
9948 }
9949 else {
9950 n_res = 1;
9951 mapped[0] = c;
9952 }
9953 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009954 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009955 res[k++] = mapped[j];
9956 }
9957 }
9958 return k;
9959}
9960
9961static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009962do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009963 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009965 Py_ssize_t i, k = 0;
9966
9967 for (i = 0; i < length; i++) {
9968 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9969 int n_res, j;
9970 if (lower)
9971 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9972 else
9973 n_res = _PyUnicode_ToUpperFull(c, mapped);
9974 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009975 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009976 res[k++] = mapped[j];
9977 }
9978 }
9979 return k;
9980}
9981
9982static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009983do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009984{
9985 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9986}
9987
9988static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009989do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009990{
9991 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9992}
9993
Benjamin Petersone51757f2012-01-12 21:10:29 -05009994static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009995do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -05009996{
9997 Py_ssize_t i, k = 0;
9998
9999 for (i = 0; i < length; i++) {
10000 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10001 Py_UCS4 mapped[3];
10002 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10003 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010004 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010005 res[k++] = mapped[j];
10006 }
10007 }
10008 return k;
10009}
10010
10011static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010012do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010013{
10014 Py_ssize_t i, k = 0;
10015 int previous_is_cased;
10016
10017 previous_is_cased = 0;
10018 for (i = 0; i < length; i++) {
10019 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10020 Py_UCS4 mapped[3];
10021 int n_res, j;
10022
10023 if (previous_is_cased)
10024 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10025 else
10026 n_res = _PyUnicode_ToTitleFull(c, mapped);
10027
10028 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010029 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010030 res[k++] = mapped[j];
10031 }
10032
10033 previous_is_cased = _PyUnicode_IsCased(c);
10034 }
10035 return k;
10036}
10037
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010038static PyObject *
10039case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010040 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010041{
10042 PyObject *res = NULL;
10043 Py_ssize_t length, newlength = 0;
10044 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010045 const void *data;
10046 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010047 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10048
Benjamin Petersoneea48462012-01-16 14:28:50 -050010049 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010050
10051 kind = PyUnicode_KIND(self);
10052 data = PyUnicode_DATA(self);
10053 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010054 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010055 PyErr_SetString(PyExc_OverflowError, "string is too long");
10056 return NULL;
10057 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010058 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010059 if (tmp == NULL)
10060 return PyErr_NoMemory();
10061 newlength = perform(kind, data, length, tmp, &maxchar);
10062 res = PyUnicode_New(newlength, maxchar);
10063 if (res == NULL)
10064 goto leave;
10065 tmpend = tmp + newlength;
10066 outdata = PyUnicode_DATA(res);
10067 outkind = PyUnicode_KIND(res);
10068 switch (outkind) {
10069 case PyUnicode_1BYTE_KIND:
10070 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10071 break;
10072 case PyUnicode_2BYTE_KIND:
10073 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10074 break;
10075 case PyUnicode_4BYTE_KIND:
10076 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10077 break;
10078 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010079 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010080 }
10081 leave:
10082 PyMem_FREE(tmp);
10083 return res;
10084}
10085
Tim Peters8ce9f162004-08-27 01:49:32 +000010086PyObject *
10087PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010089 PyObject *res;
10090 PyObject *fseq;
10091 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010092 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010094 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010095 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010096 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010097 }
10098
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010099 /* NOTE: the following code can't call back into Python code,
10100 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010101 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010102
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010103 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010104 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010105 res = _PyUnicode_JoinArray(separator, items, seqlen);
10106 Py_DECREF(fseq);
10107 return res;
10108}
10109
10110PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010111_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010112{
10113 PyObject *res = NULL; /* the result */
10114 PyObject *sep = NULL;
10115 Py_ssize_t seplen;
10116 PyObject *item;
10117 Py_ssize_t sz, i, res_offset;
10118 Py_UCS4 maxchar;
10119 Py_UCS4 item_maxchar;
10120 int use_memcpy;
10121 unsigned char *res_data = NULL, *sep_data = NULL;
10122 PyObject *last_obj;
10123 unsigned int kind = 0;
10124
Tim Peters05eba1f2004-08-27 21:32:02 +000010125 /* If empty sequence, return u"". */
10126 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010127 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010128 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010129
Tim Peters05eba1f2004-08-27 21:32:02 +000010130 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010131 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010132 if (seqlen == 1) {
10133 if (PyUnicode_CheckExact(items[0])) {
10134 res = items[0];
10135 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010136 return res;
10137 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010138 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010139 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010140 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010141 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010142 /* Set up sep and seplen */
10143 if (separator == NULL) {
10144 /* fall back to a blank space separator */
10145 sep = PyUnicode_FromOrdinal(' ');
10146 if (!sep)
10147 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010148 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010149 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010150 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010151 else {
10152 if (!PyUnicode_Check(separator)) {
10153 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010154 "separator: expected str instance,"
10155 " %.80s found",
10156 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010157 goto onError;
10158 }
10159 if (PyUnicode_READY(separator))
10160 goto onError;
10161 sep = separator;
10162 seplen = PyUnicode_GET_LENGTH(separator);
10163 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10164 /* inc refcount to keep this code path symmetric with the
10165 above case of a blank separator */
10166 Py_INCREF(sep);
10167 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010168 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010169 }
10170
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010171 /* There are at least two things to join, or else we have a subclass
10172 * of str in the sequence.
10173 * Do a pre-pass to figure out the total amount of space we'll
10174 * need (sz), and see whether all argument are strings.
10175 */
10176 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010177#ifdef Py_DEBUG
10178 use_memcpy = 0;
10179#else
10180 use_memcpy = 1;
10181#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010182 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010183 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010184 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 if (!PyUnicode_Check(item)) {
10186 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010187 "sequence item %zd: expected str instance,"
10188 " %.80s found",
10189 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010190 goto onError;
10191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 if (PyUnicode_READY(item) == -1)
10193 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010194 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010196 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010197 if (i != 0) {
10198 add_sz += seplen;
10199 }
10200 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010201 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010202 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010203 goto onError;
10204 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010205 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010206 if (use_memcpy && last_obj != NULL) {
10207 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10208 use_memcpy = 0;
10209 }
10210 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010211 }
Tim Petersced69f82003-09-16 20:30:58 +000010212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010214 if (res == NULL)
10215 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010216
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010217 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010218#ifdef Py_DEBUG
10219 use_memcpy = 0;
10220#else
10221 if (use_memcpy) {
10222 res_data = PyUnicode_1BYTE_DATA(res);
10223 kind = PyUnicode_KIND(res);
10224 if (seplen != 0)
10225 sep_data = PyUnicode_1BYTE_DATA(sep);
10226 }
10227#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010228 if (use_memcpy) {
10229 for (i = 0; i < seqlen; ++i) {
10230 Py_ssize_t itemlen;
10231 item = items[i];
10232
10233 /* Copy item, and maybe the separator. */
10234 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010235 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010236 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010237 kind * seplen);
10238 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010239 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010240
10241 itemlen = PyUnicode_GET_LENGTH(item);
10242 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010243 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010244 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010245 kind * itemlen);
10246 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010247 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010248 }
10249 assert(res_data == PyUnicode_1BYTE_DATA(res)
10250 + kind * PyUnicode_GET_LENGTH(res));
10251 }
10252 else {
10253 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10254 Py_ssize_t itemlen;
10255 item = items[i];
10256
10257 /* Copy item, and maybe the separator. */
10258 if (i && seplen != 0) {
10259 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10260 res_offset += seplen;
10261 }
10262
10263 itemlen = PyUnicode_GET_LENGTH(item);
10264 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010265 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010266 res_offset += itemlen;
10267 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010268 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010269 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010270 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010273 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275
Benjamin Peterson29060642009-01-31 22:14:21 +000010276 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010278 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279 return NULL;
10280}
10281
Victor Stinnerd3f08822012-05-29 12:57:52 +020010282void
10283_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10284 Py_UCS4 fill_char)
10285{
10286 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010287 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010288 assert(PyUnicode_IS_READY(unicode));
10289 assert(unicode_modifiable(unicode));
10290 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10291 assert(start >= 0);
10292 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010293 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010294}
10295
Victor Stinner3fe55312012-01-04 00:33:50 +010010296Py_ssize_t
10297PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10298 Py_UCS4 fill_char)
10299{
10300 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010301
10302 if (!PyUnicode_Check(unicode)) {
10303 PyErr_BadInternalCall();
10304 return -1;
10305 }
10306 if (PyUnicode_READY(unicode) == -1)
10307 return -1;
10308 if (unicode_check_modifiable(unicode))
10309 return -1;
10310
Victor Stinnerd3f08822012-05-29 12:57:52 +020010311 if (start < 0) {
10312 PyErr_SetString(PyExc_IndexError, "string index out of range");
10313 return -1;
10314 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010315 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10316 PyErr_SetString(PyExc_ValueError,
10317 "fill character is bigger than "
10318 "the string maximum character");
10319 return -1;
10320 }
10321
10322 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10323 length = Py_MIN(maxlen, length);
10324 if (length <= 0)
10325 return 0;
10326
Victor Stinnerd3f08822012-05-29 12:57:52 +020010327 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010328 return length;
10329}
10330
Victor Stinner9310abb2011-10-05 00:59:23 +020010331static PyObject *
10332pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010333 Py_ssize_t left,
10334 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 PyObject *u;
10338 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010339 int kind;
10340 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341
10342 if (left < 0)
10343 left = 0;
10344 if (right < 0)
10345 right = 0;
10346
Victor Stinnerc4b49542011-12-11 22:44:26 +010010347 if (left == 0 && right == 0)
10348 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10351 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010352 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10353 return NULL;
10354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010356 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010358 if (!u)
10359 return NULL;
10360
10361 kind = PyUnicode_KIND(u);
10362 data = PyUnicode_DATA(u);
10363 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010364 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010365 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010366 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010367 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010368 assert(_PyUnicode_CheckConsistency(u, 1));
10369 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370}
10371
Alexander Belopolsky40018472011-02-26 01:02:56 +000010372PyObject *
10373PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010377 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379
Benjamin Petersonead6b532011-12-20 17:23:42 -060010380 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010382 if (PyUnicode_IS_ASCII(string))
10383 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010384 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010385 PyUnicode_GET_LENGTH(string), keepends);
10386 else
10387 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010388 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010389 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 break;
10391 case PyUnicode_2BYTE_KIND:
10392 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010393 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 PyUnicode_GET_LENGTH(string), keepends);
10395 break;
10396 case PyUnicode_4BYTE_KIND:
10397 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010398 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 PyUnicode_GET_LENGTH(string), keepends);
10400 break;
10401 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010402 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405}
10406
Alexander Belopolsky40018472011-02-26 01:02:56 +000010407static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010408split(PyObject *self,
10409 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010410 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010412 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010413 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 Py_ssize_t len1, len2;
10415 PyObject* out;
10416
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010418 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 if (PyUnicode_READY(self) == -1)
10421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010424 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010426 if (PyUnicode_IS_ASCII(self))
10427 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010428 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010429 PyUnicode_GET_LENGTH(self), maxcount
10430 );
10431 else
10432 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010433 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010434 PyUnicode_GET_LENGTH(self), maxcount
10435 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 case PyUnicode_2BYTE_KIND:
10437 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010438 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 PyUnicode_GET_LENGTH(self), maxcount
10440 );
10441 case PyUnicode_4BYTE_KIND:
10442 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010443 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 PyUnicode_GET_LENGTH(self), maxcount
10445 );
10446 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010447 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 }
10449
10450 if (PyUnicode_READY(substring) == -1)
10451 return NULL;
10452
10453 kind1 = PyUnicode_KIND(self);
10454 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 len1 = PyUnicode_GET_LENGTH(self);
10456 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010457 if (kind1 < kind2 || len1 < len2) {
10458 out = PyList_New(1);
10459 if (out == NULL)
10460 return NULL;
10461 Py_INCREF(self);
10462 PyList_SET_ITEM(out, 0, self);
10463 return out;
10464 }
10465 buf1 = PyUnicode_DATA(self);
10466 buf2 = PyUnicode_DATA(substring);
10467 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010468 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010469 if (!buf2)
10470 return NULL;
10471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010473 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010475 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10476 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010477 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010478 else
10479 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010480 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 break;
10482 case PyUnicode_2BYTE_KIND:
10483 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010484 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 break;
10486 case PyUnicode_4BYTE_KIND:
10487 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010488 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 break;
10490 default:
10491 out = NULL;
10492 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010493 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010494 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010495 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497}
10498
Alexander Belopolsky40018472011-02-26 01:02:56 +000010499static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010500rsplit(PyObject *self,
10501 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010502 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010503{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010504 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010505 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 Py_ssize_t len1, len2;
10507 PyObject* out;
10508
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010509 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010510 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 if (PyUnicode_READY(self) == -1)
10513 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010516 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010518 if (PyUnicode_IS_ASCII(self))
10519 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010520 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010521 PyUnicode_GET_LENGTH(self), maxcount
10522 );
10523 else
10524 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010525 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010526 PyUnicode_GET_LENGTH(self), maxcount
10527 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 case PyUnicode_2BYTE_KIND:
10529 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010530 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 PyUnicode_GET_LENGTH(self), maxcount
10532 );
10533 case PyUnicode_4BYTE_KIND:
10534 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010535 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 PyUnicode_GET_LENGTH(self), maxcount
10537 );
10538 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010539 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 }
10541
10542 if (PyUnicode_READY(substring) == -1)
10543 return NULL;
10544
10545 kind1 = PyUnicode_KIND(self);
10546 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 len1 = PyUnicode_GET_LENGTH(self);
10548 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010549 if (kind1 < kind2 || len1 < len2) {
10550 out = PyList_New(1);
10551 if (out == NULL)
10552 return NULL;
10553 Py_INCREF(self);
10554 PyList_SET_ITEM(out, 0, self);
10555 return out;
10556 }
10557 buf1 = PyUnicode_DATA(self);
10558 buf2 = PyUnicode_DATA(substring);
10559 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010560 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010561 if (!buf2)
10562 return NULL;
10563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010565 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010567 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10568 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010569 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010570 else
10571 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010572 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 break;
10574 case PyUnicode_2BYTE_KIND:
10575 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010576 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 break;
10578 case PyUnicode_4BYTE_KIND:
10579 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010580 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 break;
10582 default:
10583 out = NULL;
10584 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010585 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010586 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010587 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 return out;
10589}
10590
10591static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010592anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10593 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010595 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010597 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10598 return asciilib_find(buf1, len1, buf2, len2, offset);
10599 else
10600 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 case PyUnicode_2BYTE_KIND:
10602 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10603 case PyUnicode_4BYTE_KIND:
10604 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10605 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010606 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607}
10608
10609static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010610anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10611 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010613 switch (kind) {
10614 case PyUnicode_1BYTE_KIND:
10615 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10616 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10617 else
10618 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10619 case PyUnicode_2BYTE_KIND:
10620 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10621 case PyUnicode_4BYTE_KIND:
10622 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10623 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010624 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010625}
10626
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010627static void
10628replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10629 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10630{
10631 int kind = PyUnicode_KIND(u);
10632 void *data = PyUnicode_DATA(u);
10633 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10634 if (kind == PyUnicode_1BYTE_KIND) {
10635 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10636 (Py_UCS1 *)data + len,
10637 u1, u2, maxcount);
10638 }
10639 else if (kind == PyUnicode_2BYTE_KIND) {
10640 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10641 (Py_UCS2 *)data + len,
10642 u1, u2, maxcount);
10643 }
10644 else {
10645 assert(kind == PyUnicode_4BYTE_KIND);
10646 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10647 (Py_UCS4 *)data + len,
10648 u1, u2, maxcount);
10649 }
10650}
10651
Alexander Belopolsky40018472011-02-26 01:02:56 +000010652static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653replace(PyObject *self, PyObject *str1,
10654 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010657 const char *sbuf = PyUnicode_DATA(self);
10658 const void *buf1 = PyUnicode_DATA(str1);
10659 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 int srelease = 0, release1 = 0, release2 = 0;
10661 int skind = PyUnicode_KIND(self);
10662 int kind1 = PyUnicode_KIND(str1);
10663 int kind2 = PyUnicode_KIND(str2);
10664 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10665 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10666 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010667 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010668 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010670 if (slen < len1)
10671 goto nothing;
10672
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010674 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010675 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010676 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677
Victor Stinner59de0ee2011-10-07 10:01:28 +020010678 if (str1 == str2)
10679 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680
Victor Stinner49a0a212011-10-12 23:46:10 +020010681 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010682 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10683 if (maxchar < maxchar_str1)
10684 /* substring too wide to be present */
10685 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010686 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10687 /* Replacing str1 with str2 may cause a maxchar reduction in the
10688 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010689 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010690 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010693 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010695 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010698 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010699 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010700
Victor Stinner69ed0f42013-04-09 21:48:24 +020010701 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010702 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010703 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010704 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010705 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010706 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010707 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010709
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010710 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10711 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010712 }
10713 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 int rkind = skind;
10715 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010716 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 if (kind1 < rkind) {
10719 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010720 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 if (!buf1) goto error;
10722 release1 = 1;
10723 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010724 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010725 if (i < 0)
10726 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 if (rkind > kind2) {
10728 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010729 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 if (!buf2) goto error;
10731 release2 = 1;
10732 }
10733 else if (rkind < kind2) {
10734 /* widen self and buf1 */
10735 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010736 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010737 assert(buf1 != PyUnicode_DATA(str1));
10738 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010739 buf1 = PyUnicode_DATA(str1);
10740 release1 = 0;
10741 }
10742 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 if (!sbuf) goto error;
10744 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010745 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 if (!buf1) goto error;
10747 release1 = 1;
10748 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010749 u = PyUnicode_New(slen, maxchar);
10750 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010752 assert(PyUnicode_KIND(u) == rkind);
10753 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010754
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010755 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010756 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010757 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010759 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010761
10762 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010763 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010764 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010765 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010766 if (i == -1)
10767 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010768 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010770 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010772 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010774 }
10775 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010777 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 int rkind = skind;
10779 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010782 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010783 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 if (!buf1) goto error;
10785 release1 = 1;
10786 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010787 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010788 if (n == 0)
10789 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010791 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010792 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 if (!buf2) goto error;
10794 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010797 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010799 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 if (!sbuf) goto error;
10801 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010802 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010803 assert(buf1 != PyUnicode_DATA(str1));
10804 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010805 buf1 = PyUnicode_DATA(str1);
10806 release1 = 0;
10807 }
10808 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 if (!buf1) goto error;
10810 release1 = 1;
10811 }
10812 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10813 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010814 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 PyErr_SetString(PyExc_OverflowError,
10816 "replace string is too long");
10817 goto error;
10818 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010819 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010820 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010821 _Py_INCREF_UNICODE_EMPTY();
10822 if (!unicode_empty)
10823 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010824 u = unicode_empty;
10825 goto done;
10826 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010827 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 PyErr_SetString(PyExc_OverflowError,
10829 "replace string is too long");
10830 goto error;
10831 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010832 u = PyUnicode_New(new_size, maxchar);
10833 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010835 assert(PyUnicode_KIND(u) == rkind);
10836 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 ires = i = 0;
10838 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010839 while (n-- > 0) {
10840 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010841 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010842 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010843 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010844 if (j == -1)
10845 break;
10846 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010847 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010848 memcpy(res + rkind * ires,
10849 sbuf + rkind * i,
10850 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010852 }
10853 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010855 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010857 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010863 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010864 memcpy(res + rkind * ires,
10865 sbuf + rkind * i,
10866 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010867 }
10868 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010869 /* interleave */
10870 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010871 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010873 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010875 if (--n <= 0)
10876 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010877 memcpy(res + rkind * ires,
10878 sbuf + rkind * i,
10879 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 ires++;
10881 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010882 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010883 memcpy(res + rkind * ires,
10884 sbuf + rkind * i,
10885 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010886 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010887 }
10888
10889 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010890 unicode_adjust_maxchar(&u);
10891 if (u == NULL)
10892 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010894
10895 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010896 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10897 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10898 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010900 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010902 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010904 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010905 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010907
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010909 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010910 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10911 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10912 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010914 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010916 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010918 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010919 return unicode_result_unchanged(self);
10920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010922 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10923 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10924 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10925 if (srelease)
10926 PyMem_FREE((void *)sbuf);
10927 if (release1)
10928 PyMem_FREE((void *)buf1);
10929 if (release2)
10930 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932}
10933
10934/* --- Unicode Object Methods --------------------------------------------- */
10935
INADA Naoki3ae20562017-01-16 20:41:20 +090010936/*[clinic input]
10937str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938
INADA Naoki3ae20562017-01-16 20:41:20 +090010939Return a version of the string where each word is titlecased.
10940
10941More specifically, words start with uppercased characters and all remaining
10942cased characters have lower case.
10943[clinic start generated code]*/
10944
10945static PyObject *
10946unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010947/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010949 if (PyUnicode_READY(self) == -1)
10950 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010951 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952}
10953
INADA Naoki3ae20562017-01-16 20:41:20 +090010954/*[clinic input]
10955str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956
INADA Naoki3ae20562017-01-16 20:41:20 +090010957Return a capitalized version of the string.
10958
10959More specifically, make the first character have upper case and the rest lower
10960case.
10961[clinic start generated code]*/
10962
10963static PyObject *
10964unicode_capitalize_impl(PyObject *self)
10965/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010967 if (PyUnicode_READY(self) == -1)
10968 return NULL;
10969 if (PyUnicode_GET_LENGTH(self) == 0)
10970 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010971 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972}
10973
INADA Naoki3ae20562017-01-16 20:41:20 +090010974/*[clinic input]
10975str.casefold as unicode_casefold
10976
10977Return a version of the string suitable for caseless comparisons.
10978[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010979
10980static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010981unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010982/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010983{
10984 if (PyUnicode_READY(self) == -1)
10985 return NULL;
10986 if (PyUnicode_IS_ASCII(self))
10987 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010988 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010989}
10990
10991
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010992/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010993
10994static int
10995convert_uc(PyObject *obj, void *addr)
10996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010998
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010999 if (!PyUnicode_Check(obj)) {
11000 PyErr_Format(PyExc_TypeError,
11001 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011002 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011003 return 0;
11004 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011005 if (PyUnicode_READY(obj) < 0)
11006 return 0;
11007 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011008 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011010 return 0;
11011 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011012 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011013 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011014}
11015
INADA Naoki3ae20562017-01-16 20:41:20 +090011016/*[clinic input]
11017str.center as unicode_center
11018
11019 width: Py_ssize_t
11020 fillchar: Py_UCS4 = ' '
11021 /
11022
11023Return a centered string of length width.
11024
11025Padding is done using the specified fill character (default is a space).
11026[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027
11028static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011029unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11030/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011032 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033
Benjamin Petersonbac79492012-01-14 13:34:47 -050011034 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 return NULL;
11036
Victor Stinnerc4b49542011-12-11 22:44:26 +010011037 if (PyUnicode_GET_LENGTH(self) >= width)
11038 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039
Victor Stinnerc4b49542011-12-11 22:44:26 +010011040 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041 left = marg / 2 + (marg & width & 1);
11042
Victor Stinner9310abb2011-10-05 00:59:23 +020011043 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044}
11045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046/* This function assumes that str1 and str2 are readied by the caller. */
11047
Marc-André Lemburge5034372000-08-08 08:04:29 +000011048static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011049unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011050{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011051#define COMPARE(TYPE1, TYPE2) \
11052 do { \
11053 TYPE1* p1 = (TYPE1 *)data1; \
11054 TYPE2* p2 = (TYPE2 *)data2; \
11055 TYPE1* end = p1 + len; \
11056 Py_UCS4 c1, c2; \
11057 for (; p1 != end; p1++, p2++) { \
11058 c1 = *p1; \
11059 c2 = *p2; \
11060 if (c1 != c2) \
11061 return (c1 < c2) ? -1 : 1; \
11062 } \
11063 } \
11064 while (0)
11065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011067 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011068 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 kind1 = PyUnicode_KIND(str1);
11071 kind2 = PyUnicode_KIND(str2);
11072 data1 = PyUnicode_DATA(str1);
11073 data2 = PyUnicode_DATA(str2);
11074 len1 = PyUnicode_GET_LENGTH(str1);
11075 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011076 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011077
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011078 switch(kind1) {
11079 case PyUnicode_1BYTE_KIND:
11080 {
11081 switch(kind2) {
11082 case PyUnicode_1BYTE_KIND:
11083 {
11084 int cmp = memcmp(data1, data2, len);
11085 /* normalize result of memcmp() into the range [-1; 1] */
11086 if (cmp < 0)
11087 return -1;
11088 if (cmp > 0)
11089 return 1;
11090 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011091 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011092 case PyUnicode_2BYTE_KIND:
11093 COMPARE(Py_UCS1, Py_UCS2);
11094 break;
11095 case PyUnicode_4BYTE_KIND:
11096 COMPARE(Py_UCS1, Py_UCS4);
11097 break;
11098 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011099 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011100 }
11101 break;
11102 }
11103 case PyUnicode_2BYTE_KIND:
11104 {
11105 switch(kind2) {
11106 case PyUnicode_1BYTE_KIND:
11107 COMPARE(Py_UCS2, Py_UCS1);
11108 break;
11109 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011110 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011111 COMPARE(Py_UCS2, Py_UCS2);
11112 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011113 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011114 case PyUnicode_4BYTE_KIND:
11115 COMPARE(Py_UCS2, Py_UCS4);
11116 break;
11117 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011118 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011119 }
11120 break;
11121 }
11122 case PyUnicode_4BYTE_KIND:
11123 {
11124 switch(kind2) {
11125 case PyUnicode_1BYTE_KIND:
11126 COMPARE(Py_UCS4, Py_UCS1);
11127 break;
11128 case PyUnicode_2BYTE_KIND:
11129 COMPARE(Py_UCS4, Py_UCS2);
11130 break;
11131 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011132 {
11133#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11134 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11135 /* normalize result of wmemcmp() into the range [-1; 1] */
11136 if (cmp < 0)
11137 return -1;
11138 if (cmp > 0)
11139 return 1;
11140#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011141 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011142#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011143 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011144 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011145 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011146 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011147 }
11148 break;
11149 }
11150 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011151 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011152 }
11153
Victor Stinner770e19e2012-10-04 22:59:45 +020011154 if (len1 == len2)
11155 return 0;
11156 if (len1 < len2)
11157 return -1;
11158 else
11159 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011160
11161#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011162}
11163
Benjamin Peterson621b4302016-09-09 13:54:34 -070011164static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011165unicode_compare_eq(PyObject *str1, PyObject *str2)
11166{
11167 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011168 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011169 Py_ssize_t len;
11170 int cmp;
11171
Victor Stinnere5567ad2012-10-23 02:48:49 +020011172 len = PyUnicode_GET_LENGTH(str1);
11173 if (PyUnicode_GET_LENGTH(str2) != len)
11174 return 0;
11175 kind = PyUnicode_KIND(str1);
11176 if (PyUnicode_KIND(str2) != kind)
11177 return 0;
11178 data1 = PyUnicode_DATA(str1);
11179 data2 = PyUnicode_DATA(str2);
11180
11181 cmp = memcmp(data1, data2, len * kind);
11182 return (cmp == 0);
11183}
11184
11185
Alexander Belopolsky40018472011-02-26 01:02:56 +000011186int
11187PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11190 if (PyUnicode_READY(left) == -1 ||
11191 PyUnicode_READY(right) == -1)
11192 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011193
11194 /* a string is equal to itself */
11195 if (left == right)
11196 return 0;
11197
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011198 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011200 PyErr_Format(PyExc_TypeError,
11201 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011202 Py_TYPE(left)->tp_name,
11203 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 return -1;
11205}
11206
Martin v. Löwis5b222132007-06-10 09:51:05 +000011207int
11208PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 Py_ssize_t i;
11211 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011213 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214
Victor Stinner910337b2011-10-03 03:20:16 +020011215 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011216 if (!PyUnicode_IS_READY(uni)) {
11217 const wchar_t *ws = _PyUnicode_WSTR(uni);
11218 /* Compare Unicode string and source character set string */
11219 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11220 if (chr != ustr[i])
11221 return (chr < ustr[i]) ? -1 : 1;
11222 }
11223 /* This check keeps Python strings that end in '\0' from comparing equal
11224 to C strings identical up to that point. */
11225 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11226 return 1; /* uni is longer */
11227 if (ustr[i])
11228 return -1; /* str is longer */
11229 return 0;
11230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011232 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011233 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011234 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011235 size_t len, len2 = strlen(str);
11236 int cmp;
11237
11238 len = Py_MIN(len1, len2);
11239 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011240 if (cmp != 0) {
11241 if (cmp < 0)
11242 return -1;
11243 else
11244 return 1;
11245 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011246 if (len1 > len2)
11247 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011248 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011249 return -1; /* str is longer */
11250 return 0;
11251 }
11252 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011253 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011254 /* Compare Unicode string and source character set string */
11255 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011256 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011257 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11258 /* This check keeps Python strings that end in '\0' from comparing equal
11259 to C strings identical up to that point. */
11260 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11261 return 1; /* uni is longer */
11262 if (str[i])
11263 return -1; /* str is longer */
11264 return 0;
11265 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011266}
11267
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011268static int
11269non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11270{
11271 size_t i, len;
11272 const wchar_t *p;
11273 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11274 if (strlen(str) != len)
11275 return 0;
11276 p = _PyUnicode_WSTR(unicode);
11277 assert(p);
11278 for (i = 0; i < len; i++) {
11279 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011280 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011281 return 0;
11282 }
11283 return 1;
11284}
11285
11286int
11287_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11288{
11289 size_t len;
11290 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011291 assert(str);
11292#ifndef NDEBUG
11293 for (const char *p = str; *p; p++) {
11294 assert((unsigned char)*p < 128);
11295 }
11296#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011297 if (PyUnicode_READY(unicode) == -1) {
11298 /* Memory error or bad data */
11299 PyErr_Clear();
11300 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11301 }
11302 if (!PyUnicode_IS_ASCII(unicode))
11303 return 0;
11304 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11305 return strlen(str) == len &&
11306 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11307}
11308
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011309int
11310_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11311{
11312 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011313
11314 assert(_PyUnicode_CHECK(left));
11315 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011316#ifndef NDEBUG
11317 for (const char *p = right->string; *p; p++) {
11318 assert((unsigned char)*p < 128);
11319 }
11320#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011321
11322 if (PyUnicode_READY(left) == -1) {
11323 /* memory error or bad data */
11324 PyErr_Clear();
11325 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11326 }
11327
11328 if (!PyUnicode_IS_ASCII(left))
11329 return 0;
11330
11331 right_uni = _PyUnicode_FromId(right); /* borrowed */
11332 if (right_uni == NULL) {
11333 /* memory error or bad data */
11334 PyErr_Clear();
11335 return _PyUnicode_EqualToASCIIString(left, right->string);
11336 }
11337
11338 if (left == right_uni)
11339 return 1;
11340
11341 if (PyUnicode_CHECK_INTERNED(left))
11342 return 0;
11343
Victor Stinner607b1022020-05-05 18:50:30 +020011344#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011345 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011346 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011347 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11348 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011349#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011350
11351 return unicode_compare_eq(left, right_uni);
11352}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011353
Alexander Belopolsky40018472011-02-26 01:02:56 +000011354PyObject *
11355PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011356{
11357 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011358
Victor Stinnere5567ad2012-10-23 02:48:49 +020011359 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11360 Py_RETURN_NOTIMPLEMENTED;
11361
11362 if (PyUnicode_READY(left) == -1 ||
11363 PyUnicode_READY(right) == -1)
11364 return NULL;
11365
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011366 if (left == right) {
11367 switch (op) {
11368 case Py_EQ:
11369 case Py_LE:
11370 case Py_GE:
11371 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011372 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011373 case Py_NE:
11374 case Py_LT:
11375 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011376 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011377 default:
11378 PyErr_BadArgument();
11379 return NULL;
11380 }
11381 }
11382 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011383 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011384 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011385 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011386 }
11387 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011388 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011389 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011390 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011391}
11392
Alexander Belopolsky40018472011-02-26 01:02:56 +000011393int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011394_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11395{
11396 return unicode_eq(aa, bb);
11397}
11398
11399int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011400PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011401{
Victor Stinner77282cb2013-04-14 19:22:47 +020011402 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011403 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011405 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011406
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011407 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011409 "'in <string>' requires string as left operand, not %.100s",
11410 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011411 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011412 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011413 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011414 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011415 if (ensure_unicode(str) < 0)
11416 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011419 kind2 = PyUnicode_KIND(substr);
11420 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011421 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011423 len2 = PyUnicode_GET_LENGTH(substr);
11424 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011425 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011426 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011427 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011428 if (len2 == 1) {
11429 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11430 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011431 return result;
11432 }
11433 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011434 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011435 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011436 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438
Victor Stinner77282cb2013-04-14 19:22:47 +020011439 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 case PyUnicode_1BYTE_KIND:
11441 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11442 break;
11443 case PyUnicode_2BYTE_KIND:
11444 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11445 break;
11446 case PyUnicode_4BYTE_KIND:
11447 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11448 break;
11449 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011450 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011452
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011453 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011454 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011455 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456
Guido van Rossum403d68b2000-03-13 15:55:09 +000011457 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011458}
11459
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460/* Concat to string or Unicode object giving a new Unicode object. */
11461
Alexander Belopolsky40018472011-02-26 01:02:56 +000011462PyObject *
11463PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011465 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011466 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011467 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011469 if (ensure_unicode(left) < 0)
11470 return NULL;
11471
11472 if (!PyUnicode_Check(right)) {
11473 PyErr_Format(PyExc_TypeError,
11474 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011475 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011476 return NULL;
11477 }
11478 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011479 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480
11481 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011482 if (left == unicode_empty)
11483 return PyUnicode_FromObject(right);
11484 if (right == unicode_empty)
11485 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011487 left_len = PyUnicode_GET_LENGTH(left);
11488 right_len = PyUnicode_GET_LENGTH(right);
11489 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011490 PyErr_SetString(PyExc_OverflowError,
11491 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011492 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011493 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011494 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011495
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011496 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11497 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011498 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011501 result = PyUnicode_New(new_len, maxchar);
11502 if (result == NULL)
11503 return NULL;
11504 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11505 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11506 assert(_PyUnicode_CheckConsistency(result, 1));
11507 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508}
11509
Walter Dörwald1ab83302007-05-18 17:15:44 +000011510void
Victor Stinner23e56682011-10-03 03:54:37 +020011511PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011512{
Victor Stinner23e56682011-10-03 03:54:37 +020011513 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011514 Py_UCS4 maxchar, maxchar2;
11515 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011516
11517 if (p_left == NULL) {
11518 if (!PyErr_Occurred())
11519 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011520 return;
11521 }
Victor Stinner23e56682011-10-03 03:54:37 +020011522 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011523 if (right == NULL || left == NULL
11524 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011525 if (!PyErr_Occurred())
11526 PyErr_BadInternalCall();
11527 goto error;
11528 }
11529
Benjamin Petersonbac79492012-01-14 13:34:47 -050011530 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011531 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011532 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011533 goto error;
11534
Victor Stinner488fa492011-12-12 00:01:39 +010011535 /* Shortcuts */
11536 if (left == unicode_empty) {
11537 Py_DECREF(left);
11538 Py_INCREF(right);
11539 *p_left = right;
11540 return;
11541 }
11542 if (right == unicode_empty)
11543 return;
11544
11545 left_len = PyUnicode_GET_LENGTH(left);
11546 right_len = PyUnicode_GET_LENGTH(right);
11547 if (left_len > PY_SSIZE_T_MAX - right_len) {
11548 PyErr_SetString(PyExc_OverflowError,
11549 "strings are too large to concat");
11550 goto error;
11551 }
11552 new_len = left_len + right_len;
11553
11554 if (unicode_modifiable(left)
11555 && PyUnicode_CheckExact(right)
11556 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011557 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11558 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011559 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011560 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011561 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11562 {
11563 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011564 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011565 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011566
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011567 /* copy 'right' into the newly allocated area of 'left' */
11568 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011569 }
Victor Stinner488fa492011-12-12 00:01:39 +010011570 else {
11571 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11572 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011573 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011574
Victor Stinner488fa492011-12-12 00:01:39 +010011575 /* Concat the two Unicode strings */
11576 res = PyUnicode_New(new_len, maxchar);
11577 if (res == NULL)
11578 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011579 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11580 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011581 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011582 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011583 }
11584 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011585 return;
11586
11587error:
Victor Stinner488fa492011-12-12 00:01:39 +010011588 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011589}
11590
11591void
11592PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11593{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011594 PyUnicode_Append(pleft, right);
11595 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011596}
11597
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011598/*
11599Wraps stringlib_parse_args_finds() and additionally ensures that the
11600first argument is a unicode object.
11601*/
11602
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011603static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011604parse_args_finds_unicode(const char * function_name, PyObject *args,
11605 PyObject **substring,
11606 Py_ssize_t *start, Py_ssize_t *end)
11607{
11608 if(stringlib_parse_args_finds(function_name, args, substring,
11609 start, end)) {
11610 if (ensure_unicode(*substring) < 0)
11611 return 0;
11612 return 1;
11613 }
11614 return 0;
11615}
11616
/* Docstring text for the count() method, defined as count__doc__. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011617PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011621string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011622interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
11624static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011625unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011627 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011628 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011629 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011631 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011632 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011635 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 kind1 = PyUnicode_KIND(self);
11639 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011640 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011641 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 len1 = PyUnicode_GET_LENGTH(self);
11644 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011646 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011647 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011648
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011649 buf1 = PyUnicode_DATA(self);
11650 buf2 = PyUnicode_DATA(substring);
11651 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011652 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011653 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011654 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011655 }
11656 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 case PyUnicode_1BYTE_KIND:
11658 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011659 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 buf2, len2, PY_SSIZE_T_MAX
11661 );
11662 break;
11663 case PyUnicode_2BYTE_KIND:
11664 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011665 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 buf2, len2, PY_SSIZE_T_MAX
11667 );
11668 break;
11669 case PyUnicode_4BYTE_KIND:
11670 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011671 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 buf2, len2, PY_SSIZE_T_MAX
11673 );
11674 break;
11675 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011676 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 }
11678
11679 result = PyLong_FromSsize_t(iresult);
11680
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011681 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011682 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011683 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 return result;
11686}
11687
INADA Naoki3ae20562017-01-16 20:41:20 +090011688/*[clinic input]
11689str.encode as unicode_encode
11690
11691 encoding: str(c_default="NULL") = 'utf-8'
11692 The encoding in which to encode the string.
11693 errors: str(c_default="NULL") = 'strict'
11694 The error handling scheme to use for encoding errors.
11695 The default is 'strict' meaning that encoding errors raise a
11696 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11697 'xmlcharrefreplace' as well as any other name registered with
11698 codecs.register_error that can handle UnicodeEncodeErrors.
11699
11700Encode the string using the codec registered for encoding.
11701[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
11703static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011704unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011705/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011707 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011708}
11709
INADA Naoki3ae20562017-01-16 20:41:20 +090011710/*[clinic input]
11711str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
INADA Naoki3ae20562017-01-16 20:41:20 +090011713 tabsize: int = 8
11714
11715Return a copy where all tab characters are expanded using spaces.
11716
11717If tabsize is not given, a tab size of 8 characters is assumed.
11718[clinic start generated code]*/
11719
11720static PyObject *
11721unicode_expandtabs_impl(PyObject *self, int tabsize)
11722/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011724 Py_ssize_t i, j, line_pos, src_len, incr;
11725 Py_UCS4 ch;
11726 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011727 const void *src_data;
11728 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011729 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011730 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
Antoine Pitrou22425222011-10-04 19:10:51 +020011732 if (PyUnicode_READY(self) == -1)
11733 return NULL;
11734
Thomas Wouters7e474022000-07-16 12:04:32 +000011735 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011736 src_len = PyUnicode_GET_LENGTH(self);
11737 i = j = line_pos = 0;
11738 kind = PyUnicode_KIND(self);
11739 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011740 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011741 for (; i < src_len; i++) {
11742 ch = PyUnicode_READ(kind, src_data, i);
11743 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011744 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011746 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011748 goto overflow;
11749 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011751 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011755 goto overflow;
11756 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011758 if (ch == '\n' || ch == '\r')
11759 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011761 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011762 if (!found)
11763 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011764
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011766 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767 if (!u)
11768 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011769 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
Antoine Pitroue71d5742011-10-04 15:55:09 +020011771 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
Antoine Pitroue71d5742011-10-04 15:55:09 +020011773 for (; i < src_len; i++) {
11774 ch = PyUnicode_READ(kind, src_data, i);
11775 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011777 incr = tabsize - (line_pos % tabsize);
11778 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011779 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011780 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011782 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011784 line_pos++;
11785 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011786 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011787 if (ch == '\n' || ch == '\r')
11788 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011790 }
11791 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011792 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011793
Antoine Pitroue71d5742011-10-04 15:55:09 +020011794 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011795 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797}
11798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011799PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011800 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801\n\
11802Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011803such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804arguments start and end are interpreted as in slice notation.\n\
11805\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011806Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807
11808static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011811 /* initialize variables to prevent gcc warning */
11812 PyObject *substring = NULL;
11813 Py_ssize_t start = 0;
11814 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011815 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011817 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011820 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011823 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 if (result == -2)
11826 return NULL;
11827
Christian Heimes217cfd12007-12-02 14:31:20 +000011828 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829}
11830
11831static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011832unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011834 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011835 enum PyUnicode_Kind kind;
11836 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011837
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011838 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011839 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011841 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011842 if (PyUnicode_READY(self) == -1) {
11843 return NULL;
11844 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011845 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11846 PyErr_SetString(PyExc_IndexError, "string index out of range");
11847 return NULL;
11848 }
11849 kind = PyUnicode_KIND(self);
11850 data = PyUnicode_DATA(self);
11851 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011852 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853}
11854
Guido van Rossumc2504932007-09-18 19:42:40 +000011855/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011856 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011857static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011858unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011860 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011861
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011862#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011863 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011864#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 if (_PyUnicode_HASH(self) != -1)
11866 return _PyUnicode_HASH(self);
11867 if (PyUnicode_READY(self) == -1)
11868 return -1;
animalizea1d14252019-01-02 20:16:06 +080011869
Christian Heimes985ecdc2013-11-20 11:46:18 +010011870 x = _Py_HashBytes(PyUnicode_DATA(self),
11871 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011873 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874}
11875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011876PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878\n\
oldkaa0735f2018-02-02 16:52:55 +080011879Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011880such that sub is contained within S[start:end]. Optional\n\
11881arguments start and end are interpreted as in slice notation.\n\
11882\n\
11883Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
11885static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011888 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011889 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011890 PyObject *substring = NULL;
11891 Py_ssize_t start = 0;
11892 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011894 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011897 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011900 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 if (result == -2)
11903 return NULL;
11904
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905 if (result < 0) {
11906 PyErr_SetString(PyExc_ValueError, "substring not found");
11907 return NULL;
11908 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011909
Christian Heimes217cfd12007-12-02 14:31:20 +000011910 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911}
11912
INADA Naoki3ae20562017-01-16 20:41:20 +090011913/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011914str.isascii as unicode_isascii
11915
11916Return True if all characters in the string are ASCII, False otherwise.
11917
11918ASCII characters have code points in the range U+0000-U+007F.
11919Empty string is ASCII too.
11920[clinic start generated code]*/
11921
11922static PyObject *
11923unicode_isascii_impl(PyObject *self)
11924/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11925{
11926 if (PyUnicode_READY(self) == -1) {
11927 return NULL;
11928 }
11929 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11930}
11931
11932/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011933str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
INADA Naoki3ae20562017-01-16 20:41:20 +090011935Return True if the string is a lowercase string, False otherwise.
11936
11937A string is lowercase if all cased characters in the string are lowercase and
11938there is at least one cased character in the string.
11939[clinic start generated code]*/
11940
11941static PyObject *
11942unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011943/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 Py_ssize_t i, length;
11946 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011947 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 int cased;
11949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 if (PyUnicode_READY(self) == -1)
11951 return NULL;
11952 length = PyUnicode_GET_LENGTH(self);
11953 kind = PyUnicode_KIND(self);
11954 data = PyUnicode_DATA(self);
11955
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (length == 1)
11958 return PyBool_FromLong(
11959 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011961 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011963 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011964
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 for (i = 0; i < length; i++) {
11967 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011968
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011970 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 else if (!cased && Py_UNICODE_ISLOWER(ch))
11972 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011974 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975}
11976
INADA Naoki3ae20562017-01-16 20:41:20 +090011977/*[clinic input]
11978str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
INADA Naoki3ae20562017-01-16 20:41:20 +090011980Return True if the string is an uppercase string, False otherwise.
11981
11982A string is uppercase if all cased characters in the string are uppercase and
11983there is at least one cased character in the string.
11984[clinic start generated code]*/
11985
11986static PyObject *
11987unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011988/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 Py_ssize_t i, length;
11991 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011992 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993 int cased;
11994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (PyUnicode_READY(self) == -1)
11996 return NULL;
11997 length = PyUnicode_GET_LENGTH(self);
11998 kind = PyUnicode_KIND(self);
11999 data = PyUnicode_DATA(self);
12000
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (length == 1)
12003 return PyBool_FromLong(
12004 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012006 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012008 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012009
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 for (i = 0; i < length; i++) {
12012 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012013
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012015 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012016 else if (!cased && Py_UNICODE_ISUPPER(ch))
12017 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012019 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020}
12021
INADA Naoki3ae20562017-01-16 20:41:20 +090012022/*[clinic input]
12023str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024
INADA Naoki3ae20562017-01-16 20:41:20 +090012025Return True if the string is a title-cased string, False otherwise.
12026
12027In a title-cased string, upper- and title-case characters may only
12028follow uncased characters and lowercase characters only cased ones.
12029[clinic start generated code]*/
12030
12031static PyObject *
12032unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012033/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 Py_ssize_t i, length;
12036 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012037 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038 int cased, previous_is_cased;
12039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 if (PyUnicode_READY(self) == -1)
12041 return NULL;
12042 length = PyUnicode_GET_LENGTH(self);
12043 kind = PyUnicode_KIND(self);
12044 data = PyUnicode_DATA(self);
12045
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 if (length == 1) {
12048 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12049 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12050 (Py_UNICODE_ISUPPER(ch) != 0));
12051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012053 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012055 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012056
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057 cased = 0;
12058 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 for (i = 0; i < length; i++) {
12060 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012061
Benjamin Peterson29060642009-01-31 22:14:21 +000012062 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12063 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012064 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012065 previous_is_cased = 1;
12066 cased = 1;
12067 }
12068 else if (Py_UNICODE_ISLOWER(ch)) {
12069 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012070 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 previous_is_cased = 1;
12072 cased = 1;
12073 }
12074 else
12075 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012077 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078}
12079
INADA Naoki3ae20562017-01-16 20:41:20 +090012080/*[clinic input]
12081str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082
INADA Naoki3ae20562017-01-16 20:41:20 +090012083Return True if the string is a whitespace string, False otherwise.
12084
12085A string is whitespace if all characters in the string are whitespace and there
12086is at least one character in the string.
12087[clinic start generated code]*/
12088
12089static PyObject *
12090unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012091/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 Py_ssize_t i, length;
12094 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012095 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096
12097 if (PyUnicode_READY(self) == -1)
12098 return NULL;
12099 length = PyUnicode_GET_LENGTH(self);
12100 kind = PyUnicode_KIND(self);
12101 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 if (length == 1)
12105 return PyBool_FromLong(
12106 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012108 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012110 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 for (i = 0; i < length; i++) {
12113 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012114 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012115 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012117 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118}
12119
INADA Naoki3ae20562017-01-16 20:41:20 +090012120/*[clinic input]
12121str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012122
INADA Naoki3ae20562017-01-16 20:41:20 +090012123Return True if the string is an alphabetic string, False otherwise.
12124
12125A string is alphabetic if all characters in the string are alphabetic and there
12126is at least one character in the string.
12127[clinic start generated code]*/
12128
12129static PyObject *
12130unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012131/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 Py_ssize_t i, length;
12134 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012135 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136
12137 if (PyUnicode_READY(self) == -1)
12138 return NULL;
12139 length = PyUnicode_GET_LENGTH(self);
12140 kind = PyUnicode_KIND(self);
12141 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012142
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012143 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 if (length == 1)
12145 return PyBool_FromLong(
12146 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012147
12148 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012150 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 for (i = 0; i < length; i++) {
12153 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012154 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012155 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012156 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012157}
12158
INADA Naoki3ae20562017-01-16 20:41:20 +090012159/*[clinic input]
12160str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012161
INADA Naoki3ae20562017-01-16 20:41:20 +090012162Return True if the string is an alpha-numeric string, False otherwise.
12163
12164A string is alpha-numeric if all characters in the string are alpha-numeric and
12165there is at least one character in the string.
12166[clinic start generated code]*/
12167
12168static PyObject *
12169unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012170/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012171{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012173 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 Py_ssize_t len, i;
12175
12176 if (PyUnicode_READY(self) == -1)
12177 return NULL;
12178
12179 kind = PyUnicode_KIND(self);
12180 data = PyUnicode_DATA(self);
12181 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012182
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012183 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 if (len == 1) {
12185 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12186 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12187 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012188
12189 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012191 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 for (i = 0; i < len; i++) {
12194 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012195 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012196 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012197 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012198 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012199}
12200
INADA Naoki3ae20562017-01-16 20:41:20 +090012201/*[clinic input]
12202str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203
INADA Naoki3ae20562017-01-16 20:41:20 +090012204Return True if the string is a decimal string, False otherwise.
12205
12206A string is a decimal string if all characters in the string are decimal and
12207there is at least one character in the string.
12208[clinic start generated code]*/
12209
12210static PyObject *
12211unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012212/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 Py_ssize_t i, length;
12215 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012216 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217
12218 if (PyUnicode_READY(self) == -1)
12219 return NULL;
12220 length = PyUnicode_GET_LENGTH(self);
12221 kind = PyUnicode_KIND(self);
12222 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 if (length == 1)
12226 return PyBool_FromLong(
12227 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012229 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012231 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 for (i = 0; i < length; i++) {
12234 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012235 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012237 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238}
12239
INADA Naoki3ae20562017-01-16 20:41:20 +090012240/*[clinic input]
12241str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242
INADA Naoki3ae20562017-01-16 20:41:20 +090012243Return True if the string is a digit string, False otherwise.
12244
12245A string is a digit string if all characters in the string are digits and there
12246is at least one character in the string.
12247[clinic start generated code]*/
12248
12249static PyObject *
12250unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012251/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 Py_ssize_t i, length;
12254 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012255 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256
12257 if (PyUnicode_READY(self) == -1)
12258 return NULL;
12259 length = PyUnicode_GET_LENGTH(self);
12260 kind = PyUnicode_KIND(self);
12261 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 if (length == 1) {
12265 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12266 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12267 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012269 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012271 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 for (i = 0; i < length; i++) {
12274 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012275 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012277 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278}
12279
INADA Naoki3ae20562017-01-16 20:41:20 +090012280/*[clinic input]
12281str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
INADA Naoki3ae20562017-01-16 20:41:20 +090012283Return True if the string is a numeric string, False otherwise.
12284
12285A string is numeric if all characters in the string are numeric and there is at
12286least one character in the string.
12287[clinic start generated code]*/
12288
12289static PyObject *
12290unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012291/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 Py_ssize_t i, length;
12294 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012295 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296
12297 if (PyUnicode_READY(self) == -1)
12298 return NULL;
12299 length = PyUnicode_GET_LENGTH(self);
12300 kind = PyUnicode_KIND(self);
12301 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 if (length == 1)
12305 return PyBool_FromLong(
12306 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012308 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012310 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 for (i = 0; i < length; i++) {
12313 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012314 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012316 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317}
12318
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012319Py_ssize_t
12320_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012323 if (PyUnicode_READY(self) == -1)
12324 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012325
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012326 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012327 if (len == 0) {
12328 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 }
12331
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012332 int kind = PyUnicode_KIND(self);
12333 const void *data = PyUnicode_DATA(self);
12334 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012335 /* PEP 3131 says that the first character must be in
12336 XID_Start and subsequent characters in XID_Continue,
12337 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012338 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012339 letters, digits, underscore). However, given the current
12340 definition of XID_Start and XID_Continue, it is sufficient
12341 to check just for these, except that _ must be allowed
12342 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012343 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012344 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012345 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012346
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012347 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012348 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012349 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012350 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012351 }
12352 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012353 return i;
12354}
12355
12356int
12357PyUnicode_IsIdentifier(PyObject *self)
12358{
12359 if (PyUnicode_IS_READY(self)) {
12360 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12361 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12362 /* an empty string is not a valid identifier */
12363 return len && i == len;
12364 }
12365 else {
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012366 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012367 if (len == 0) {
12368 /* an empty string is not a valid identifier */
12369 return 0;
12370 }
12371
12372 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012373 Py_UCS4 ch = wstr[i++];
12374#if SIZEOF_WCHAR_T == 2
12375 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12376 && i < len
12377 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12378 {
12379 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12380 i++;
12381 }
12382#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012383 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12384 return 0;
12385 }
12386
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012387 while (i < len) {
12388 ch = wstr[i++];
12389#if SIZEOF_WCHAR_T == 2
12390 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12391 && i < len
12392 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12393 {
12394 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12395 i++;
12396 }
12397#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012398 if (!_PyUnicode_IsXidContinue(ch)) {
12399 return 0;
12400 }
12401 }
12402 return 1;
12403 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012404}
12405
INADA Naoki3ae20562017-01-16 20:41:20 +090012406/*[clinic input]
12407str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012408
INADA Naoki3ae20562017-01-16 20:41:20 +090012409Return True if the string is a valid Python identifier, False otherwise.
12410
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012411Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012412such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012413[clinic start generated code]*/
12414
12415static PyObject *
12416unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012417/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012418{
12419 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12420}
12421
INADA Naoki3ae20562017-01-16 20:41:20 +090012422/*[clinic input]
12423str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012424
INADA Naoki3ae20562017-01-16 20:41:20 +090012425Return True if the string is printable, False otherwise.
12426
12427A string is printable if all of its characters are considered printable in
12428repr() or if it is empty.
12429[clinic start generated code]*/
12430
12431static PyObject *
12432unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012433/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012434{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 Py_ssize_t i, length;
12436 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012437 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438
12439 if (PyUnicode_READY(self) == -1)
12440 return NULL;
12441 length = PyUnicode_GET_LENGTH(self);
12442 kind = PyUnicode_KIND(self);
12443 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012444
12445 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 if (length == 1)
12447 return PyBool_FromLong(
12448 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 for (i = 0; i < length; i++) {
12451 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012452 Py_RETURN_FALSE;
12453 }
12454 }
12455 Py_RETURN_TRUE;
12456}
12457
INADA Naoki3ae20562017-01-16 20:41:20 +090012458/*[clinic input]
12459str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012460
INADA Naoki3ae20562017-01-16 20:41:20 +090012461 iterable: object
12462 /
12463
12464Concatenate any number of strings.
12465
Martin Panter91a88662017-01-24 00:30:06 +000012466The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012467The result is returned as a new string.
12468
12469Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12470[clinic start generated code]*/
12471
12472static PyObject *
12473unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012474/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475{
INADA Naoki3ae20562017-01-16 20:41:20 +090012476 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477}
12478
Martin v. Löwis18e16552006-02-15 17:27:45 +000012479static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012480unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 if (PyUnicode_READY(self) == -1)
12483 return -1;
12484 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485}
12486
INADA Naoki3ae20562017-01-16 20:41:20 +090012487/*[clinic input]
12488str.ljust as unicode_ljust
12489
12490 width: Py_ssize_t
12491 fillchar: Py_UCS4 = ' '
12492 /
12493
12494Return a left-justified string of length width.
12495
12496Padding is done using the specified fill character (default is a space).
12497[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498
12499static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012500unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12501/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012503 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505
Victor Stinnerc4b49542011-12-11 22:44:26 +010012506 if (PyUnicode_GET_LENGTH(self) >= width)
12507 return unicode_result_unchanged(self);
12508
12509 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510}
12511
INADA Naoki3ae20562017-01-16 20:41:20 +090012512/*[clinic input]
12513str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514
INADA Naoki3ae20562017-01-16 20:41:20 +090012515Return a copy of the string converted to lowercase.
12516[clinic start generated code]*/
12517
12518static PyObject *
12519unicode_lower_impl(PyObject *self)
12520/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012522 if (PyUnicode_READY(self) == -1)
12523 return NULL;
12524 if (PyUnicode_IS_ASCII(self))
12525 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012526 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527}
12528
/* Selectors for the strip family of functions: which end(s) of the string
   get stripped. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above; used in error messages.
   Both the strings and the table itself are const: the table is a
   read-only lookup and must never be modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip selector (LEFTSTRIP, RIGHTSTRIP, BOTHSTRIP) to its method
   name. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012537
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012538/* externally visible for str.strip(unicode) */
12539PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012540_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012541{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012542 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 int kind;
12544 Py_ssize_t i, j, len;
12545 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012546 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12549 return NULL;
12550
12551 kind = PyUnicode_KIND(self);
12552 data = PyUnicode_DATA(self);
12553 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012554 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12556 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012557 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012558
Benjamin Peterson14339b62009-01-31 16:36:08 +000012559 i = 0;
12560 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012561 while (i < len) {
12562 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12563 if (!BLOOM(sepmask, ch))
12564 break;
12565 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12566 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012567 i++;
12568 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012569 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012570
Benjamin Peterson14339b62009-01-31 16:36:08 +000012571 j = len;
12572 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012573 j--;
12574 while (j >= i) {
12575 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12576 if (!BLOOM(sepmask, ch))
12577 break;
12578 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12579 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012581 }
12582
Benjamin Peterson29060642009-01-31 22:14:21 +000012583 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012584 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012585
Victor Stinner7931d9a2011-11-04 00:22:48 +010012586 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587}
12588
12589PyObject*
12590PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12591{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012592 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012594 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595
Victor Stinnerde636f32011-10-01 03:55:54 +020012596 if (PyUnicode_READY(self) == -1)
12597 return NULL;
12598
Victor Stinner684d5fd2012-05-03 02:32:34 +020012599 length = PyUnicode_GET_LENGTH(self);
12600 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012601
Victor Stinner684d5fd2012-05-03 02:32:34 +020012602 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012603 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604
Victor Stinnerde636f32011-10-01 03:55:54 +020012605 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012606 PyErr_SetString(PyExc_IndexError, "string index out of range");
12607 return NULL;
12608 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012609 if (start >= length || end < start)
12610 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012611
Victor Stinner684d5fd2012-05-03 02:32:34 +020012612 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012613 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012614 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012615 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012616 }
12617 else {
12618 kind = PyUnicode_KIND(self);
12619 data = PyUnicode_1BYTE_DATA(self);
12620 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012621 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012622 length);
12623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625
12626static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012627do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 Py_ssize_t len, i, j;
12630
12631 if (PyUnicode_READY(self) == -1)
12632 return NULL;
12633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012635
Victor Stinnercc7af722013-04-09 22:39:24 +020012636 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012637 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012638
12639 i = 0;
12640 if (striptype != RIGHTSTRIP) {
12641 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012642 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012643 if (!_Py_ascii_whitespace[ch])
12644 break;
12645 i++;
12646 }
12647 }
12648
12649 j = len;
12650 if (striptype != LEFTSTRIP) {
12651 j--;
12652 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012653 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012654 if (!_Py_ascii_whitespace[ch])
12655 break;
12656 j--;
12657 }
12658 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012659 }
12660 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012661 else {
12662 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012663 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012664
Victor Stinnercc7af722013-04-09 22:39:24 +020012665 i = 0;
12666 if (striptype != RIGHTSTRIP) {
12667 while (i < len) {
12668 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12669 if (!Py_UNICODE_ISSPACE(ch))
12670 break;
12671 i++;
12672 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012673 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012674
12675 j = len;
12676 if (striptype != LEFTSTRIP) {
12677 j--;
12678 while (j >= i) {
12679 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12680 if (!Py_UNICODE_ISSPACE(ch))
12681 break;
12682 j--;
12683 }
12684 j++;
12685 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012686 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012687
Victor Stinner7931d9a2011-11-04 00:22:48 +010012688 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689}
12690
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012691
12692static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012693do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012694{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012695 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012696 if (PyUnicode_Check(sep))
12697 return _PyUnicode_XStrip(self, striptype, sep);
12698 else {
12699 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012700 "%s arg must be None or str",
12701 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012702 return NULL;
12703 }
12704 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012705
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012707}
12708
12709
INADA Naoki3ae20562017-01-16 20:41:20 +090012710/*[clinic input]
12711str.strip as unicode_strip
12712
12713 chars: object = None
12714 /
12715
Zachary Ware09895c22019-10-09 16:09:00 -050012716Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012717
12718If chars is given and not None, remove characters in chars instead.
12719[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012720
12721static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012722unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012723/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012724{
INADA Naoki3ae20562017-01-16 20:41:20 +090012725 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012726}
12727
12728
INADA Naoki3ae20562017-01-16 20:41:20 +090012729/*[clinic input]
12730str.lstrip as unicode_lstrip
12731
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012732 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012733 /
12734
12735Return a copy of the string with leading whitespace removed.
12736
12737If chars is given and not None, remove characters in chars instead.
12738[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012739
12740static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012741unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012742/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012743{
INADA Naoki3ae20562017-01-16 20:41:20 +090012744 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012745}
12746
12747
INADA Naoki3ae20562017-01-16 20:41:20 +090012748/*[clinic input]
12749str.rstrip as unicode_rstrip
12750
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012751 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012752 /
12753
12754Return a copy of the string with trailing whitespace removed.
12755
12756If chars is given and not None, remove characters in chars instead.
12757[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012758
12759static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012760unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012761/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012762{
INADA Naoki3ae20562017-01-16 20:41:20 +090012763 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012764}
12765
12766
Guido van Rossumd57fd912000-03-10 22:53:23 +000012767static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012768unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012770 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772
Serhiy Storchaka05997252013-01-26 12:14:02 +020012773 if (len < 1)
12774 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775
Victor Stinnerc4b49542011-12-11 22:44:26 +010012776 /* no repeat, return original string */
12777 if (len == 1)
12778 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012779
Benjamin Petersonbac79492012-01-14 13:34:47 -050012780 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781 return NULL;
12782
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012783 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012784 PyErr_SetString(PyExc_OverflowError,
12785 "repeated string is too long");
12786 return NULL;
12787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012789
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012790 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791 if (!u)
12792 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012793 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012796 int kind = PyUnicode_KIND(str);
12797 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012798 if (kind == PyUnicode_1BYTE_KIND) {
12799 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012800 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012801 }
12802 else if (kind == PyUnicode_2BYTE_KIND) {
12803 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012804 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012805 ucs2[n] = fill_char;
12806 } else {
12807 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12808 assert(kind == PyUnicode_4BYTE_KIND);
12809 for (n = 0; n < len; ++n)
12810 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812 }
12813 else {
12814 /* number of characters copied this far */
12815 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012816 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012817 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012818 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012822 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825 }
12826
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012827 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012828 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829}
12830
Alexander Belopolsky40018472011-02-26 01:02:56 +000012831PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012832PyUnicode_Replace(PyObject *str,
12833 PyObject *substr,
12834 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012835 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012837 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12838 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012839 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012840 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841}
12842
INADA Naoki3ae20562017-01-16 20:41:20 +090012843/*[clinic input]
12844str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845
INADA Naoki3ae20562017-01-16 20:41:20 +090012846 old: unicode
12847 new: unicode
12848 count: Py_ssize_t = -1
12849 Maximum number of occurrences to replace.
12850 -1 (the default value) means replace all occurrences.
12851 /
12852
12853Return a copy with all occurrences of substring old replaced by new.
12854
12855If the optional argument count is given, only the first count occurrences are
12856replaced.
12857[clinic start generated code]*/
12858
12859static PyObject *
12860unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12861 Py_ssize_t count)
12862/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012864 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012866 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867}
12868
sweeneydea81849b2020-04-22 17:05:48 -040012869/*[clinic input]
12870str.removeprefix as unicode_removeprefix
12871
12872 prefix: unicode
12873 /
12874
12875Return a str with the given prefix string removed if present.
12876
12877If the string starts with the prefix string, return string[len(prefix):].
12878Otherwise, return a copy of the original string.
12879[clinic start generated code]*/
12880
12881static PyObject *
12882unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12883/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12884{
12885 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12886 if (match == -1) {
12887 return NULL;
12888 }
12889 if (match) {
12890 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12891 PyUnicode_GET_LENGTH(self));
12892 }
12893 return unicode_result_unchanged(self);
12894}
12895
12896/*[clinic input]
12897str.removesuffix as unicode_removesuffix
12898
12899 suffix: unicode
12900 /
12901
12902Return a str with the given suffix string removed if present.
12903
12904If the string ends with the suffix string and that suffix is not empty,
12905return string[:-len(suffix)]. Otherwise, return a copy of the original
12906string.
12907[clinic start generated code]*/
12908
12909static PyObject *
12910unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12911/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12912{
12913 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12914 if (match == -1) {
12915 return NULL;
12916 }
12917 if (match) {
12918 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12919 - PyUnicode_GET_LENGTH(suffix));
12920 }
12921 return unicode_result_unchanged(self);
12922}
12923
Alexander Belopolsky40018472011-02-26 01:02:56 +000012924static PyObject *
12925unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012927 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 Py_ssize_t isize;
12929 Py_ssize_t osize, squote, dquote, i, o;
12930 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012931 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012932 const void *idata;
12933 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012936 return NULL;
12937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 isize = PyUnicode_GET_LENGTH(unicode);
12939 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012941 /* Compute length of output, quote characters, and
12942 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012943 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944 max = 127;
12945 squote = dquote = 0;
12946 ikind = PyUnicode_KIND(unicode);
12947 for (i = 0; i < isize; i++) {
12948 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012949 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012951 case '\'': squote++; break;
12952 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012954 incr = 2;
12955 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 default:
12957 /* Fast-path ASCII */
12958 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012959 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012961 ;
12962 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012963 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012965 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012967 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012969 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012971 if (osize > PY_SSIZE_T_MAX - incr) {
12972 PyErr_SetString(PyExc_OverflowError,
12973 "string is too long to generate repr");
12974 return NULL;
12975 }
12976 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 }
12978
12979 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012980 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012982 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 if (dquote)
12984 /* Both squote and dquote present. Use squote,
12985 and escape them */
12986 osize += squote;
12987 else
12988 quote = '"';
12989 }
Victor Stinner55c08782013-04-14 18:45:39 +020012990 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991
12992 repr = PyUnicode_New(osize, max);
12993 if (repr == NULL)
12994 return NULL;
12995 okind = PyUnicode_KIND(repr);
12996 odata = PyUnicode_DATA(repr);
12997
12998 PyUnicode_WRITE(okind, odata, 0, quote);
12999 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013000 if (unchanged) {
13001 _PyUnicode_FastCopyCharacters(repr, 1,
13002 unicode, 0,
13003 isize);
13004 }
13005 else {
13006 for (i = 0, o = 1; i < isize; i++) {
13007 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008
Victor Stinner55c08782013-04-14 18:45:39 +020013009 /* Escape quotes and backslashes */
13010 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013011 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013013 continue;
13014 }
13015
13016 /* Map special whitespace to '\t', \n', '\r' */
13017 if (ch == '\t') {
13018 PyUnicode_WRITE(okind, odata, o++, '\\');
13019 PyUnicode_WRITE(okind, odata, o++, 't');
13020 }
13021 else if (ch == '\n') {
13022 PyUnicode_WRITE(okind, odata, o++, '\\');
13023 PyUnicode_WRITE(okind, odata, o++, 'n');
13024 }
13025 else if (ch == '\r') {
13026 PyUnicode_WRITE(okind, odata, o++, '\\');
13027 PyUnicode_WRITE(okind, odata, o++, 'r');
13028 }
13029
13030 /* Map non-printable US ASCII to '\xhh' */
13031 else if (ch < ' ' || ch == 0x7F) {
13032 PyUnicode_WRITE(okind, odata, o++, '\\');
13033 PyUnicode_WRITE(okind, odata, o++, 'x');
13034 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13035 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13036 }
13037
13038 /* Copy ASCII characters as-is */
13039 else if (ch < 0x7F) {
13040 PyUnicode_WRITE(okind, odata, o++, ch);
13041 }
13042
13043 /* Non-ASCII characters */
13044 else {
13045 /* Map Unicode whitespace and control characters
13046 (categories Z* and C* except ASCII space)
13047 */
13048 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13049 PyUnicode_WRITE(okind, odata, o++, '\\');
13050 /* Map 8-bit characters to '\xhh' */
13051 if (ch <= 0xff) {
13052 PyUnicode_WRITE(okind, odata, o++, 'x');
13053 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13055 }
13056 /* Map 16-bit characters to '\uxxxx' */
13057 else if (ch <= 0xffff) {
13058 PyUnicode_WRITE(okind, odata, o++, 'u');
13059 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13063 }
13064 /* Map 21-bit characters to '\U00xxxxxx' */
13065 else {
13066 PyUnicode_WRITE(okind, odata, o++, 'U');
13067 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13068 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13070 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13071 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13072 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13073 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13074 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13075 }
13076 }
13077 /* Copy characters as-is */
13078 else {
13079 PyUnicode_WRITE(okind, odata, o++, ch);
13080 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013081 }
13082 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013084 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013085 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013086 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087}
13088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013089PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013090 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091\n\
13092Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013093such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094arguments start and end are interpreted as in slice notation.\n\
13095\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013096Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097
13098static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013101 /* initialize variables to prevent gcc warning */
13102 PyObject *substring = NULL;
13103 Py_ssize_t start = 0;
13104 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013105 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013107 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013110 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013111 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013113 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013115 if (result == -2)
13116 return NULL;
13117
Christian Heimes217cfd12007-12-02 14:31:20 +000013118 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119}
13120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013121PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013124Return the highest index in S where substring sub is found,\n\
13125such that sub is contained within S[start:end]. Optional\n\
13126arguments start and end are interpreted as in slice notation.\n\
13127\n\
13128Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129
13130static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013133 /* initialize variables to prevent gcc warning */
13134 PyObject *substring = NULL;
13135 Py_ssize_t start = 0;
13136 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013137 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013139 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013142 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013144
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013145 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 if (result == -2)
13148 return NULL;
13149
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150 if (result < 0) {
13151 PyErr_SetString(PyExc_ValueError, "substring not found");
13152 return NULL;
13153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154
Christian Heimes217cfd12007-12-02 14:31:20 +000013155 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156}
13157
INADA Naoki3ae20562017-01-16 20:41:20 +090013158/*[clinic input]
13159str.rjust as unicode_rjust
13160
13161 width: Py_ssize_t
13162 fillchar: Py_UCS4 = ' '
13163 /
13164
13165Return a right-justified string of length width.
13166
13167Padding is done using the specified fill character (default is a space).
13168[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013169
13170static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013171unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13172/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013174 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175 return NULL;
13176
Victor Stinnerc4b49542011-12-11 22:44:26 +010013177 if (PyUnicode_GET_LENGTH(self) >= width)
13178 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179
Victor Stinnerc4b49542011-12-11 22:44:26 +010013180 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181}
13182
Alexander Belopolsky40018472011-02-26 01:02:56 +000013183PyObject *
13184PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013186 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013189 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190}
13191
INADA Naoki3ae20562017-01-16 20:41:20 +090013192/*[clinic input]
13193str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194
INADA Naoki3ae20562017-01-16 20:41:20 +090013195 sep: object = None
13196 The delimiter according which to split the string.
13197 None (the default value) means split according to any whitespace,
13198 and discard empty strings from the result.
13199 maxsplit: Py_ssize_t = -1
13200 Maximum number of splits to do.
13201 -1 (the default value) means no limit.
13202
13203Return a list of the words in the string, using sep as the delimiter string.
13204[clinic start generated code]*/
13205
13206static PyObject *
13207unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13208/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209{
INADA Naoki3ae20562017-01-16 20:41:20 +090013210 if (sep == Py_None)
13211 return split(self, NULL, maxsplit);
13212 if (PyUnicode_Check(sep))
13213 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013214
Victor Stinner998b8062018-09-12 00:23:25 +020013215 PyErr_Format(PyExc_TypeError,
13216 "must be str or None, not %.100s",
13217 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013218 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219}
13220
Thomas Wouters477c8d52006-05-27 19:21:47 +000013221PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013222PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013223{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013224 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013225 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013226 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013228
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013229 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013230 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013231
Victor Stinner14f8f022011-10-05 20:58:25 +020013232 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 len1 = PyUnicode_GET_LENGTH(str_obj);
13235 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013236 if (kind1 < kind2 || len1 < len2) {
13237 _Py_INCREF_UNICODE_EMPTY();
13238 if (!unicode_empty)
13239 out = NULL;
13240 else {
13241 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13242 Py_DECREF(unicode_empty);
13243 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013244 return out;
13245 }
13246 buf1 = PyUnicode_DATA(str_obj);
13247 buf2 = PyUnicode_DATA(sep_obj);
13248 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013249 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013250 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013251 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013253
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013254 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013255 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013256 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13257 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13258 else
13259 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 break;
13261 case PyUnicode_2BYTE_KIND:
13262 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13263 break;
13264 case PyUnicode_4BYTE_KIND:
13265 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13266 break;
13267 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013268 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013270
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013271 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013272 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013273 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013274
13275 return out;
13276}
13277
13278
13279PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013280PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013281{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013282 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013283 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013284 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013285 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013286
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013287 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013288 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013289
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013290 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013291 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292 len1 = PyUnicode_GET_LENGTH(str_obj);
13293 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013294 if (kind1 < kind2 || len1 < len2) {
13295 _Py_INCREF_UNICODE_EMPTY();
13296 if (!unicode_empty)
13297 out = NULL;
13298 else {
13299 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13300 Py_DECREF(unicode_empty);
13301 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013302 return out;
13303 }
13304 buf1 = PyUnicode_DATA(str_obj);
13305 buf2 = PyUnicode_DATA(sep_obj);
13306 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013307 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013308 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013309 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013312 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013314 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13315 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13316 else
13317 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 break;
13319 case PyUnicode_2BYTE_KIND:
13320 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13321 break;
13322 case PyUnicode_4BYTE_KIND:
13323 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13324 break;
13325 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013326 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013328
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013329 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013330 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013331 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013332
13333 return out;
13334}
13335
INADA Naoki3ae20562017-01-16 20:41:20 +090013336/*[clinic input]
13337str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013338
INADA Naoki3ae20562017-01-16 20:41:20 +090013339 sep: object
13340 /
13341
13342Partition the string into three parts using the given separator.
13343
13344This will search for the separator in the string. If the separator is found,
13345returns a 3-tuple containing the part before the separator, the separator
13346itself, and the part after it.
13347
13348If the separator is not found, returns a 3-tuple containing the original string
13349and two empty strings.
13350[clinic start generated code]*/
13351
13352static PyObject *
13353unicode_partition(PyObject *self, PyObject *sep)
13354/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013355{
INADA Naoki3ae20562017-01-16 20:41:20 +090013356 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013357}
13358
INADA Naoki3ae20562017-01-16 20:41:20 +090013359/*[clinic input]
13360str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013361
INADA Naoki3ae20562017-01-16 20:41:20 +090013362Partition the string into three parts using the given separator.
13363
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013364This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013365the separator is found, returns a 3-tuple containing the part before the
13366separator, the separator itself, and the part after it.
13367
13368If the separator is not found, returns a 3-tuple containing two empty strings
13369and the original string.
13370[clinic start generated code]*/
13371
13372static PyObject *
13373unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013374/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013375{
INADA Naoki3ae20562017-01-16 20:41:20 +090013376 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013377}
13378
Alexander Belopolsky40018472011-02-26 01:02:56 +000013379PyObject *
13380PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013381{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013382 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013383 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013384
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013385 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013386}
13387
INADA Naoki3ae20562017-01-16 20:41:20 +090013388/*[clinic input]
13389str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013390
INADA Naoki3ae20562017-01-16 20:41:20 +090013391Return a list of the words in the string, using sep as the delimiter string.
13392
13393Splits are done starting at the end of the string and working to the front.
13394[clinic start generated code]*/
13395
13396static PyObject *
13397unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13398/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013399{
INADA Naoki3ae20562017-01-16 20:41:20 +090013400 if (sep == Py_None)
13401 return rsplit(self, NULL, maxsplit);
13402 if (PyUnicode_Check(sep))
13403 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013404
Victor Stinner998b8062018-09-12 00:23:25 +020013405 PyErr_Format(PyExc_TypeError,
13406 "must be str or None, not %.100s",
13407 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013408 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013409}
13410
INADA Naoki3ae20562017-01-16 20:41:20 +090013411/*[clinic input]
13412str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013414 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013415
13416Return a list of the lines in the string, breaking at line boundaries.
13417
13418Line breaks are not included in the resulting list unless keepends is given and
13419true.
13420[clinic start generated code]*/
13421
13422static PyObject *
13423unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013424/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013425{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013426 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427}
13428
13429static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013430PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013432 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433}
13434
INADA Naoki3ae20562017-01-16 20:41:20 +090013435/*[clinic input]
13436str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437
INADA Naoki3ae20562017-01-16 20:41:20 +090013438Convert uppercase characters to lowercase and lowercase characters to uppercase.
13439[clinic start generated code]*/
13440
13441static PyObject *
13442unicode_swapcase_impl(PyObject *self)
13443/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013445 if (PyUnicode_READY(self) == -1)
13446 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013447 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448}
13449
Larry Hastings61272b72014-01-07 12:41:53 -080013450/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013451
Larry Hastings31826802013-10-19 00:09:25 -070013452@staticmethod
13453str.maketrans as unicode_maketrans
13454
13455 x: object
13456
13457 y: unicode=NULL
13458
13459 z: unicode=NULL
13460
13461 /
13462
13463Return a translation table usable for str.translate().
13464
13465If there is only one argument, it must be a dictionary mapping Unicode
13466ordinals (integers) or characters to Unicode ordinals, strings or None.
13467Character keys will be then converted to ordinals.
13468If there are two arguments, they must be strings of equal length, and
13469in the resulting dictionary, each character in x will be mapped to the
13470character at the same position in y. If there is a third argument, it
13471must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013472[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013473
Larry Hastings31826802013-10-19 00:09:25 -070013474static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013475unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013476/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013477{
Georg Brandlceee0772007-11-27 23:48:05 +000013478 PyObject *new = NULL, *key, *value;
13479 Py_ssize_t i = 0;
13480 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013481
Georg Brandlceee0772007-11-27 23:48:05 +000013482 new = PyDict_New();
13483 if (!new)
13484 return NULL;
13485 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013486 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013487 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013488
Georg Brandlceee0772007-11-27 23:48:05 +000013489 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013490 if (!PyUnicode_Check(x)) {
13491 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13492 "be a string if there is a second argument");
13493 goto err;
13494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013495 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013496 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13497 "arguments must have equal length");
13498 goto err;
13499 }
13500 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013501 x_kind = PyUnicode_KIND(x);
13502 y_kind = PyUnicode_KIND(y);
13503 x_data = PyUnicode_DATA(x);
13504 y_data = PyUnicode_DATA(y);
13505 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13506 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013507 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013508 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013509 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013510 if (!value) {
13511 Py_DECREF(key);
13512 goto err;
13513 }
Georg Brandlceee0772007-11-27 23:48:05 +000013514 res = PyDict_SetItem(new, key, value);
13515 Py_DECREF(key);
13516 Py_DECREF(value);
13517 if (res < 0)
13518 goto err;
13519 }
13520 /* create entries for deleting chars in z */
13521 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013522 z_kind = PyUnicode_KIND(z);
13523 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013524 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013525 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013526 if (!key)
13527 goto err;
13528 res = PyDict_SetItem(new, key, Py_None);
13529 Py_DECREF(key);
13530 if (res < 0)
13531 goto err;
13532 }
13533 }
13534 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013535 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013536 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013537
Georg Brandlceee0772007-11-27 23:48:05 +000013538 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013539 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013540 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13541 "to maketrans it must be a dict");
13542 goto err;
13543 }
13544 /* copy entries into the new dict, converting string keys to int keys */
13545 while (PyDict_Next(x, &i, &key, &value)) {
13546 if (PyUnicode_Check(key)) {
13547 /* convert string keys to integer keys */
13548 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013549 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013550 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13551 "table must be of length 1");
13552 goto err;
13553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013554 kind = PyUnicode_KIND(key);
13555 data = PyUnicode_DATA(key);
13556 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013557 if (!newkey)
13558 goto err;
13559 res = PyDict_SetItem(new, newkey, value);
13560 Py_DECREF(newkey);
13561 if (res < 0)
13562 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013563 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013564 /* just keep integer keys */
13565 if (PyDict_SetItem(new, key, value) < 0)
13566 goto err;
13567 } else {
13568 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13569 "be strings or integers");
13570 goto err;
13571 }
13572 }
13573 }
13574 return new;
13575 err:
13576 Py_DECREF(new);
13577 return NULL;
13578}
13579
INADA Naoki3ae20562017-01-16 20:41:20 +090013580/*[clinic input]
13581str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013582
INADA Naoki3ae20562017-01-16 20:41:20 +090013583 table: object
13584 Translation table, which must be a mapping of Unicode ordinals to
13585 Unicode ordinals, strings, or None.
13586 /
13587
13588Replace each character in the string using the given translation table.
13589
13590The table must implement lookup/indexing via __getitem__, for instance a
13591dictionary or list. If this operation raises LookupError, the character is
13592left untouched. Characters mapped to None are deleted.
13593[clinic start generated code]*/
13594
13595static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013596unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013597/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013598{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013599 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013600}
13601
INADA Naoki3ae20562017-01-16 20:41:20 +090013602/*[clinic input]
13603str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013604
INADA Naoki3ae20562017-01-16 20:41:20 +090013605Return a copy of the string converted to uppercase.
13606[clinic start generated code]*/
13607
13608static PyObject *
13609unicode_upper_impl(PyObject *self)
13610/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013611{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013612 if (PyUnicode_READY(self) == -1)
13613 return NULL;
13614 if (PyUnicode_IS_ASCII(self))
13615 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013616 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013617}
13618
INADA Naoki3ae20562017-01-16 20:41:20 +090013619/*[clinic input]
13620str.zfill as unicode_zfill
13621
13622 width: Py_ssize_t
13623 /
13624
13625Pad a numeric string with zeros on the left, to fill a field of the given width.
13626
13627The string is never truncated.
13628[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013629
13630static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013631unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013632/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013633{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013634 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013635 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013636 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013637 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013638 Py_UCS4 chr;
13639
Benjamin Petersonbac79492012-01-14 13:34:47 -050013640 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013641 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013642
Victor Stinnerc4b49542011-12-11 22:44:26 +010013643 if (PyUnicode_GET_LENGTH(self) >= width)
13644 return unicode_result_unchanged(self);
13645
13646 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013647
13648 u = pad(self, fill, 0, '0');
13649
Walter Dörwald068325e2002-04-15 13:36:47 +000013650 if (u == NULL)
13651 return NULL;
13652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013653 kind = PyUnicode_KIND(u);
13654 data = PyUnicode_DATA(u);
13655 chr = PyUnicode_READ(kind, data, fill);
13656
13657 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013658 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013659 PyUnicode_WRITE(kind, data, 0, chr);
13660 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013661 }
13662
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013663 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013664 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013665}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013666
#if 0
/* Compiled out: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013675PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013676 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013678Return True if S starts with the specified prefix, False otherwise.\n\
13679With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013680With optional end, stop comparing S at that position.\n\
13681prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013682
13683static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013684unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013685 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013686{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013687 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013688 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013689 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013690 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013691 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013692
Jesus Ceaac451502011-04-20 17:09:23 +020013693 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013694 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013695 if (PyTuple_Check(subobj)) {
13696 Py_ssize_t i;
13697 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013698 substring = PyTuple_GET_ITEM(subobj, i);
13699 if (!PyUnicode_Check(substring)) {
13700 PyErr_Format(PyExc_TypeError,
13701 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013702 "not %.100s",
13703 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013704 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013705 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013706 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013707 if (result == -1)
13708 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013709 if (result) {
13710 Py_RETURN_TRUE;
13711 }
13712 }
13713 /* nothing matched */
13714 Py_RETURN_FALSE;
13715 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013716 if (!PyUnicode_Check(subobj)) {
13717 PyErr_Format(PyExc_TypeError,
13718 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013719 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013720 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013721 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013722 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013723 if (result == -1)
13724 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013725 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726}
13727
13728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013729PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013730 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013731\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013732Return True if S ends with the specified suffix, False otherwise.\n\
13733With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013734With optional end, stop comparing S at that position.\n\
13735suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013736
13737static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013738unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013739 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013740{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013741 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013742 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013743 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013744 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013745 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013746
Jesus Ceaac451502011-04-20 17:09:23 +020013747 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013748 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013749 if (PyTuple_Check(subobj)) {
13750 Py_ssize_t i;
13751 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013752 substring = PyTuple_GET_ITEM(subobj, i);
13753 if (!PyUnicode_Check(substring)) {
13754 PyErr_Format(PyExc_TypeError,
13755 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013756 "not %.100s",
13757 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013758 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013759 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013760 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013761 if (result == -1)
13762 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013763 if (result) {
13764 Py_RETURN_TRUE;
13765 }
13766 }
13767 Py_RETURN_FALSE;
13768 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013769 if (!PyUnicode_Check(subobj)) {
13770 PyErr_Format(PyExc_TypeError,
13771 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013772 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013773 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013774 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013775 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013776 if (result == -1)
13777 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013778 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013779}
13780
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013781static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013782_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013783{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013784 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13785 writer->data = PyUnicode_DATA(writer->buffer);
13786
13787 if (!writer->readonly) {
13788 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013789 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013790 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013791 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013792 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13793 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13794 writer->kind = PyUnicode_WCHAR_KIND;
13795 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13796
Victor Stinner8f674cc2013-04-17 23:02:17 +020013797 /* Copy-on-write mode: set buffer size to 0 so
13798 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13799 * next write. */
13800 writer->size = 0;
13801 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013802}
13803
Victor Stinnerd3f08822012-05-29 12:57:52 +020013804void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013805_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013806{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013807 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013808
13809 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013810 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013811
13812 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13813 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13814 writer->kind = PyUnicode_WCHAR_KIND;
13815 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013816}
13817
Inada Naoki770847a2019-06-24 12:30:24 +090013818// Initialize _PyUnicodeWriter with initial buffer
13819static inline void
13820_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13821{
13822 memset(writer, 0, sizeof(*writer));
13823 writer->buffer = buffer;
13824 _PyUnicodeWriter_Update(writer);
13825 writer->min_length = writer->size;
13826}
13827
Victor Stinnerd3f08822012-05-29 12:57:52 +020013828int
13829_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13830 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013831{
13832 Py_ssize_t newlen;
13833 PyObject *newbuffer;
13834
Victor Stinner2740e462016-09-06 16:58:36 -070013835 assert(maxchar <= MAX_UNICODE);
13836
Victor Stinnerca9381e2015-09-22 00:58:32 +020013837 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013838 assert((maxchar > writer->maxchar && length >= 0)
13839 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013840
Victor Stinner202fdca2012-05-07 12:47:02 +020013841 if (length > PY_SSIZE_T_MAX - writer->pos) {
13842 PyErr_NoMemory();
13843 return -1;
13844 }
13845 newlen = writer->pos + length;
13846
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013847 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013848
Victor Stinnerd3f08822012-05-29 12:57:52 +020013849 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013850 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013851 if (writer->overallocate
13852 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13853 /* overallocate to limit the number of realloc() */
13854 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013855 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013856 if (newlen < writer->min_length)
13857 newlen = writer->min_length;
13858
Victor Stinnerd3f08822012-05-29 12:57:52 +020013859 writer->buffer = PyUnicode_New(newlen, maxchar);
13860 if (writer->buffer == NULL)
13861 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013862 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013863 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013864 if (writer->overallocate
13865 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13866 /* overallocate to limit the number of realloc() */
13867 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013868 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013869 if (newlen < writer->min_length)
13870 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013871
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013872 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013873 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013874 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013875 newbuffer = PyUnicode_New(newlen, maxchar);
13876 if (newbuffer == NULL)
13877 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013878 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13879 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013880 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013881 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013882 }
13883 else {
13884 newbuffer = resize_compact(writer->buffer, newlen);
13885 if (newbuffer == NULL)
13886 return -1;
13887 }
13888 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013889 }
13890 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013891 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013892 newbuffer = PyUnicode_New(writer->size, maxchar);
13893 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013894 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013895 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13896 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013897 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013898 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013899 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013900 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013901
13902#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013903}
13904
Victor Stinnerca9381e2015-09-22 00:58:32 +020013905int
13906_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13907 enum PyUnicode_Kind kind)
13908{
13909 Py_UCS4 maxchar;
13910
13911 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13912 assert(writer->kind < kind);
13913
13914 switch (kind)
13915 {
13916 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13917 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13918 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13919 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013920 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013921 }
13922
13923 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13924}
13925
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013926static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013927_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013928{
Victor Stinner2740e462016-09-06 16:58:36 -070013929 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013930 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13931 return -1;
13932 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13933 writer->pos++;
13934 return 0;
13935}
13936
13937int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013938_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13939{
13940 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13941}
13942
13943int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013944_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13945{
13946 Py_UCS4 maxchar;
13947 Py_ssize_t len;
13948
13949 if (PyUnicode_READY(str) == -1)
13950 return -1;
13951 len = PyUnicode_GET_LENGTH(str);
13952 if (len == 0)
13953 return 0;
13954 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13955 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013956 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013957 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013958 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013959 Py_INCREF(str);
13960 writer->buffer = str;
13961 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013962 writer->pos += len;
13963 return 0;
13964 }
13965 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13966 return -1;
13967 }
13968 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13969 str, 0, len);
13970 writer->pos += len;
13971 return 0;
13972}
13973
Victor Stinnere215d962012-10-06 23:03:36 +020013974int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013975_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13976 Py_ssize_t start, Py_ssize_t end)
13977{
13978 Py_UCS4 maxchar;
13979 Py_ssize_t len;
13980
13981 if (PyUnicode_READY(str) == -1)
13982 return -1;
13983
13984 assert(0 <= start);
13985 assert(end <= PyUnicode_GET_LENGTH(str));
13986 assert(start <= end);
13987
13988 if (end == 0)
13989 return 0;
13990
13991 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13992 return _PyUnicodeWriter_WriteStr(writer, str);
13993
13994 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13995 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13996 else
13997 maxchar = writer->maxchar;
13998 len = end - start;
13999
14000 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14001 return -1;
14002
14003 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14004 str, start, len);
14005 writer->pos += len;
14006 return 0;
14007}
14008
14009int
Victor Stinner4a587072013-11-19 12:54:53 +010014010_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14011 const char *ascii, Py_ssize_t len)
14012{
14013 if (len == -1)
14014 len = strlen(ascii);
14015
Andy Lestere6be9b52020-02-11 20:28:35 -060014016 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014017
14018 if (writer->buffer == NULL && !writer->overallocate) {
14019 PyObject *str;
14020
14021 str = _PyUnicode_FromASCII(ascii, len);
14022 if (str == NULL)
14023 return -1;
14024
14025 writer->readonly = 1;
14026 writer->buffer = str;
14027 _PyUnicodeWriter_Update(writer);
14028 writer->pos += len;
14029 return 0;
14030 }
14031
14032 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14033 return -1;
14034
14035 switch (writer->kind)
14036 {
14037 case PyUnicode_1BYTE_KIND:
14038 {
14039 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14040 Py_UCS1 *data = writer->data;
14041
Christian Heimesf051e432016-09-13 20:22:02 +020014042 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014043 break;
14044 }
14045 case PyUnicode_2BYTE_KIND:
14046 {
14047 _PyUnicode_CONVERT_BYTES(
14048 Py_UCS1, Py_UCS2,
14049 ascii, ascii + len,
14050 (Py_UCS2 *)writer->data + writer->pos);
14051 break;
14052 }
14053 case PyUnicode_4BYTE_KIND:
14054 {
14055 _PyUnicode_CONVERT_BYTES(
14056 Py_UCS1, Py_UCS4,
14057 ascii, ascii + len,
14058 (Py_UCS4 *)writer->data + writer->pos);
14059 break;
14060 }
14061 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014062 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014063 }
14064
14065 writer->pos += len;
14066 return 0;
14067}
14068
14069int
14070_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14071 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014072{
14073 Py_UCS4 maxchar;
14074
Andy Lestere6be9b52020-02-11 20:28:35 -060014075 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014076 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14077 return -1;
14078 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14079 writer->pos += len;
14080 return 0;
14081}
14082
Victor Stinnerd3f08822012-05-29 12:57:52 +020014083PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014084_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014085{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014086 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014087
Victor Stinnerd3f08822012-05-29 12:57:52 +020014088 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014089 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014090 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014091 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014092
14093 str = writer->buffer;
14094 writer->buffer = NULL;
14095
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014096 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014097 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14098 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014099 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014100
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014101 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14102 PyObject *str2;
14103 str2 = resize_compact(str, writer->pos);
14104 if (str2 == NULL) {
14105 Py_DECREF(str);
14106 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014107 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014108 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014109 }
14110
Victor Stinner15a0bd32013-07-08 22:29:55 +020014111 assert(_PyUnicode_CheckConsistency(str, 1));
14112 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014113}
14114
Victor Stinnerd3f08822012-05-29 12:57:52 +020014115void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014116_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014117{
14118 Py_CLEAR(writer->buffer);
14119}
14120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014121#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014122
14123PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014124 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014125\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014126Return a formatted version of S, using substitutions from args and kwargs.\n\
14127The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014128
Eric Smith27bbca62010-11-04 17:06:58 +000014129PyDoc_STRVAR(format_map__doc__,
14130 "S.format_map(mapping) -> str\n\
14131\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014132Return a formatted version of S, using substitutions from mapping.\n\
14133The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014134
INADA Naoki3ae20562017-01-16 20:41:20 +090014135/*[clinic input]
14136str.__format__ as unicode___format__
14137
14138 format_spec: unicode
14139 /
14140
14141Return a formatted version of the string as described by format_spec.
14142[clinic start generated code]*/
14143
Eric Smith4a7d76d2008-05-30 18:10:19 +000014144static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014145unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014146/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014147{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014148 _PyUnicodeWriter writer;
14149 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014150
Victor Stinnerd3f08822012-05-29 12:57:52 +020014151 if (PyUnicode_READY(self) == -1)
14152 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014153 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014154 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14155 self, format_spec, 0,
14156 PyUnicode_GET_LENGTH(format_spec));
14157 if (ret == -1) {
14158 _PyUnicodeWriter_Dealloc(&writer);
14159 return NULL;
14160 }
14161 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014162}
14163
INADA Naoki3ae20562017-01-16 20:41:20 +090014164/*[clinic input]
14165str.__sizeof__ as unicode_sizeof
14166
14167Return the size of the string in memory, in bytes.
14168[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014169
14170static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014171unicode_sizeof_impl(PyObject *self)
14172/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014174 Py_ssize_t size;
14175
14176 /* If it's a compact object, account for base structure +
14177 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014178 if (PyUnicode_IS_COMPACT_ASCII(self))
14179 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14180 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014181 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014182 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014183 else {
14184 /* If it is a two-block object, account for base object, and
14185 for character block if present. */
14186 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014187 if (_PyUnicode_DATA_ANY(self))
14188 size += (PyUnicode_GET_LENGTH(self) + 1) *
14189 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014190 }
14191 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014192 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014193 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14194 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14195 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14196 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014197
14198 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014199}
14200
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014201static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014202unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014203{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014204 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014205 if (!copy)
14206 return NULL;
14207 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014208}
14209
Guido van Rossumd57fd912000-03-10 22:53:23 +000014210static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014211 UNICODE_ENCODE_METHODDEF
14212 UNICODE_REPLACE_METHODDEF
14213 UNICODE_SPLIT_METHODDEF
14214 UNICODE_RSPLIT_METHODDEF
14215 UNICODE_JOIN_METHODDEF
14216 UNICODE_CAPITALIZE_METHODDEF
14217 UNICODE_CASEFOLD_METHODDEF
14218 UNICODE_TITLE_METHODDEF
14219 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014220 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014221 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014222 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014223 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014224 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014225 UNICODE_LJUST_METHODDEF
14226 UNICODE_LOWER_METHODDEF
14227 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014228 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14229 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014230 UNICODE_RJUST_METHODDEF
14231 UNICODE_RSTRIP_METHODDEF
14232 UNICODE_RPARTITION_METHODDEF
14233 UNICODE_SPLITLINES_METHODDEF
14234 UNICODE_STRIP_METHODDEF
14235 UNICODE_SWAPCASE_METHODDEF
14236 UNICODE_TRANSLATE_METHODDEF
14237 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014238 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14239 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014240 UNICODE_REMOVEPREFIX_METHODDEF
14241 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014242 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014243 UNICODE_ISLOWER_METHODDEF
14244 UNICODE_ISUPPER_METHODDEF
14245 UNICODE_ISTITLE_METHODDEF
14246 UNICODE_ISSPACE_METHODDEF
14247 UNICODE_ISDECIMAL_METHODDEF
14248 UNICODE_ISDIGIT_METHODDEF
14249 UNICODE_ISNUMERIC_METHODDEF
14250 UNICODE_ISALPHA_METHODDEF
14251 UNICODE_ISALNUM_METHODDEF
14252 UNICODE_ISIDENTIFIER_METHODDEF
14253 UNICODE_ISPRINTABLE_METHODDEF
14254 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014255 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014256 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014257 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014258 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014259 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014260#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014261 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014262 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014263#endif
14264
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014265 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014266 {NULL, NULL}
14267};
14268
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014269static PyObject *
14270unicode_mod(PyObject *v, PyObject *w)
14271{
Brian Curtindfc80e32011-08-10 20:28:54 -050014272 if (!PyUnicode_Check(v))
14273 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014274 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014275}
14276
14277static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014278 0, /*nb_add*/
14279 0, /*nb_subtract*/
14280 0, /*nb_multiply*/
14281 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014282};
14283
Guido van Rossumd57fd912000-03-10 22:53:23 +000014284static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014285 (lenfunc) unicode_length, /* sq_length */
14286 PyUnicode_Concat, /* sq_concat */
14287 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14288 (ssizeargfunc) unicode_getitem, /* sq_item */
14289 0, /* sq_slice */
14290 0, /* sq_ass_item */
14291 0, /* sq_ass_slice */
14292 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014293};
14294
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014295static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014296unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014298 if (PyUnicode_READY(self) == -1)
14299 return NULL;
14300
Victor Stinnera15e2602020-04-08 02:01:56 +020014301 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014302 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014303 if (i == -1 && PyErr_Occurred())
14304 return NULL;
14305 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014306 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014307 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014308 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014309 Py_ssize_t start, stop, step, slicelength, i;
14310 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014311 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014312 const void *src_data;
14313 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014314 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014315 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014316
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014317 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014318 return NULL;
14319 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014320 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14321 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014322
14323 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014324 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014325 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014326 slicelength == PyUnicode_GET_LENGTH(self)) {
14327 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014328 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014329 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014330 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014331 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014332 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014333 src_kind = PyUnicode_KIND(self);
14334 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014335 if (!PyUnicode_IS_ASCII(self)) {
14336 kind_limit = kind_maxchar_limit(src_kind);
14337 max_char = 0;
14338 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14339 ch = PyUnicode_READ(src_kind, src_data, cur);
14340 if (ch > max_char) {
14341 max_char = ch;
14342 if (max_char >= kind_limit)
14343 break;
14344 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014345 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014346 }
Victor Stinner55c99112011-10-13 01:17:06 +020014347 else
14348 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014349 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014350 if (result == NULL)
14351 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014352 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014353 dest_data = PyUnicode_DATA(result);
14354
14355 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014356 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14357 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014358 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014359 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014360 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014361 } else {
14362 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14363 return NULL;
14364 }
14365}
14366
14367static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014368 (lenfunc)unicode_length, /* mp_length */
14369 (binaryfunc)unicode_subscript, /* mp_subscript */
14370 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014371};
14372
Guido van Rossumd57fd912000-03-10 22:53:23 +000014373
Guido van Rossumd57fd912000-03-10 22:53:23 +000014374/* Helpers for PyUnicode_Format() */
14375
Victor Stinnera47082312012-10-04 02:19:54 +020014376struct unicode_formatter_t {
14377 PyObject *args;
14378 int args_owned;
14379 Py_ssize_t arglen, argidx;
14380 PyObject *dict;
14381
14382 enum PyUnicode_Kind fmtkind;
14383 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014384 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014385 PyObject *fmtstr;
14386
14387 _PyUnicodeWriter writer;
14388};
14389
14390struct unicode_format_arg_t {
14391 Py_UCS4 ch;
14392 int flags;
14393 Py_ssize_t width;
14394 int prec;
14395 int sign;
14396};
14397
Guido van Rossumd57fd912000-03-10 22:53:23 +000014398static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014399unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014400{
Victor Stinnera47082312012-10-04 02:19:54 +020014401 Py_ssize_t argidx = ctx->argidx;
14402
14403 if (argidx < ctx->arglen) {
14404 ctx->argidx++;
14405 if (ctx->arglen < 0)
14406 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014407 else
Victor Stinnera47082312012-10-04 02:19:54 +020014408 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014409 }
14410 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014411 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014412 return NULL;
14413}
14414
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014415/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014416
Victor Stinnera47082312012-10-04 02:19:54 +020014417/* Format a float into the writer if the writer is not NULL, or into *p_output
14418 otherwise.
14419
14420 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014421static int
Victor Stinnera47082312012-10-04 02:19:54 +020014422formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14423 PyObject **p_output,
14424 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014425{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014426 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014427 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014428 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014429 int prec;
14430 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014431
Guido van Rossumd57fd912000-03-10 22:53:23 +000014432 x = PyFloat_AsDouble(v);
14433 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014434 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014435
Victor Stinnera47082312012-10-04 02:19:54 +020014436 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014437 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014438 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014439
Victor Stinnera47082312012-10-04 02:19:54 +020014440 if (arg->flags & F_ALT)
14441 dtoa_flags = Py_DTSF_ALT;
14442 else
14443 dtoa_flags = 0;
14444 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014445 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014446 return -1;
14447 len = strlen(p);
14448 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014449 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014450 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014451 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014452 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014453 }
14454 else
14455 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014456 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014457 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014458}
14459
Victor Stinnerd0880d52012-04-27 23:40:13 +020014460/* formatlong() emulates the format codes d, u, o, x and X, and
14461 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14462 * Python's regular ints.
14463 * Return value: a new PyUnicodeObject*, or NULL if error.
14464 * The output string is of the form
14465 * "-"? ("0x" | "0X")? digit+
14466 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14467 * set in flags. The case of hex digits will be correct,
14468 * There will be at least prec digits, zero-filled on the left if
14469 * necessary to get that many.
14470 * val object to be converted
14471 * flags bitmask of format flags; only F_ALT is looked at
14472 * prec minimum number of digits; 0-fill on left if needed
14473 * type a character in [duoxX]; u acts the same as d
14474 *
14475 * CAUTION: o, x and X conversions on regular ints can never
14476 * produce a '-' sign, but can for Python's unbounded ints.
14477 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014478PyObject *
14479_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014480{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014481 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014482 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014483 Py_ssize_t i;
14484 int sign; /* 1 if '-', else 0 */
14485 int len; /* number of characters */
14486 Py_ssize_t llen;
14487 int numdigits; /* len == numnondigits + numdigits */
14488 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014489
Victor Stinnerd0880d52012-04-27 23:40:13 +020014490 /* Avoid exceeding SSIZE_T_MAX */
14491 if (prec > INT_MAX-3) {
14492 PyErr_SetString(PyExc_OverflowError,
14493 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014494 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014495 }
14496
14497 assert(PyLong_Check(val));
14498
14499 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014500 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014501 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014502 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014503 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014504 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014505 /* int and int subclasses should print numerically when a numeric */
14506 /* format code is used (see issue18780) */
14507 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014508 break;
14509 case 'o':
14510 numnondigits = 2;
14511 result = PyNumber_ToBase(val, 8);
14512 break;
14513 case 'x':
14514 case 'X':
14515 numnondigits = 2;
14516 result = PyNumber_ToBase(val, 16);
14517 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014518 }
14519 if (!result)
14520 return NULL;
14521
14522 assert(unicode_modifiable(result));
14523 assert(PyUnicode_IS_READY(result));
14524 assert(PyUnicode_IS_ASCII(result));
14525
14526 /* To modify the string in-place, there can only be one reference. */
14527 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014528 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014529 PyErr_BadInternalCall();
14530 return NULL;
14531 }
14532 buf = PyUnicode_DATA(result);
14533 llen = PyUnicode_GET_LENGTH(result);
14534 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014535 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014536 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014537 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014538 return NULL;
14539 }
14540 len = (int)llen;
14541 sign = buf[0] == '-';
14542 numnondigits += sign;
14543 numdigits = len - numnondigits;
14544 assert(numdigits > 0);
14545
14546 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014547 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014548 (type == 'o' || type == 'x' || type == 'X'))) {
14549 assert(buf[sign] == '0');
14550 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14551 buf[sign+1] == 'o');
14552 numnondigits -= 2;
14553 buf += 2;
14554 len -= 2;
14555 if (sign)
14556 buf[0] = '-';
14557 assert(len == numnondigits + numdigits);
14558 assert(numdigits > 0);
14559 }
14560
14561 /* Fill with leading zeroes to meet minimum width. */
14562 if (prec > numdigits) {
14563 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14564 numnondigits + prec);
14565 char *b1;
14566 if (!r1) {
14567 Py_DECREF(result);
14568 return NULL;
14569 }
14570 b1 = PyBytes_AS_STRING(r1);
14571 for (i = 0; i < numnondigits; ++i)
14572 *b1++ = *buf++;
14573 for (i = 0; i < prec - numdigits; i++)
14574 *b1++ = '0';
14575 for (i = 0; i < numdigits; i++)
14576 *b1++ = *buf++;
14577 *b1 = '\0';
14578 Py_DECREF(result);
14579 result = r1;
14580 buf = PyBytes_AS_STRING(result);
14581 len = numnondigits + prec;
14582 }
14583
14584 /* Fix up case for hex conversions. */
14585 if (type == 'X') {
14586 /* Need to convert all lower case letters to upper case.
14587 and need to convert 0x to 0X (and -0x to -0X). */
14588 for (i = 0; i < len; i++)
14589 if (buf[i] >= 'a' && buf[i] <= 'x')
14590 buf[i] -= 'a'-'A';
14591 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014592 if (!PyUnicode_Check(result)
14593 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014594 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014595 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014596 Py_DECREF(result);
14597 result = unicode;
14598 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014599 else if (len != PyUnicode_GET_LENGTH(result)) {
14600 if (PyUnicode_Resize(&result, len) < 0)
14601 Py_CLEAR(result);
14602 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014603 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014604}
14605
Ethan Furmandf3ed242014-01-05 06:50:30 -080014606/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014607 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014608 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014609 * -1 and raise an exception on error */
14610static int
Victor Stinnera47082312012-10-04 02:19:54 +020014611mainformatlong(PyObject *v,
14612 struct unicode_format_arg_t *arg,
14613 PyObject **p_output,
14614 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014615{
14616 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014617 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014618
14619 if (!PyNumber_Check(v))
14620 goto wrongtype;
14621
Ethan Furman9ab74802014-03-21 06:38:46 -070014622 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014623 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014624 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014625 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014626 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014627 if (PyErr_ExceptionMatches(PyExc_TypeError))
14628 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014629 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014630 }
14631 }
14632 else {
14633 iobj = PyNumber_Long(v);
14634 if (iobj == NULL ) {
14635 if (PyErr_ExceptionMatches(PyExc_TypeError))
14636 goto wrongtype;
14637 return -1;
14638 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014639 }
14640 assert(PyLong_Check(iobj));
14641 }
14642 else {
14643 iobj = v;
14644 Py_INCREF(iobj);
14645 }
14646
14647 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014648 && arg->width == -1 && arg->prec == -1
14649 && !(arg->flags & (F_SIGN | F_BLANK))
14650 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014651 {
14652 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014653 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014654 int base;
14655
Victor Stinnera47082312012-10-04 02:19:54 +020014656 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014657 {
14658 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014659 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014660 case 'd':
14661 case 'i':
14662 case 'u':
14663 base = 10;
14664 break;
14665 case 'o':
14666 base = 8;
14667 break;
14668 case 'x':
14669 case 'X':
14670 base = 16;
14671 break;
14672 }
14673
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014674 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14675 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014676 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014677 }
14678 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014679 return 1;
14680 }
14681
Ethan Furmanb95b5612015-01-23 20:05:18 -080014682 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014683 Py_DECREF(iobj);
14684 if (res == NULL)
14685 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014686 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014687 return 0;
14688
14689wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014690 switch(type)
14691 {
14692 case 'o':
14693 case 'x':
14694 case 'X':
14695 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014696 "%%%c format: an integer is required, "
14697 "not %.200s",
14698 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014699 break;
14700 default:
14701 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014702 "%%%c format: a number is required, "
14703 "not %.200s",
14704 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014705 break;
14706 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014707 return -1;
14708}
14709
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014710static Py_UCS4
14711formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014712{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014713 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014714 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014715 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014716 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014717 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014718 goto onError;
14719 }
14720 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014721 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014722 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014723 /* make sure number is a type of integer */
14724 if (!PyLong_Check(v)) {
14725 iobj = PyNumber_Index(v);
14726 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014727 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014728 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014729 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014730 Py_DECREF(iobj);
14731 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014732 else {
14733 x = PyLong_AsLong(v);
14734 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014735 if (x == -1 && PyErr_Occurred())
14736 goto onError;
14737
Victor Stinner8faf8212011-12-08 22:14:11 +010014738 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014739 PyErr_SetString(PyExc_OverflowError,
14740 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014741 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014742 }
14743
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014744 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014745 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014746
Benjamin Peterson29060642009-01-31 22:14:21 +000014747 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014748 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014749 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014750 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014751}
14752
Victor Stinnera47082312012-10-04 02:19:54 +020014753/* Parse options of an argument: flags, width, precision.
14754 Handle also "%(name)" syntax.
14755
14756 Return 0 if the argument has been formatted into arg->str.
14757 Return 1 if the argument has been written into ctx->writer,
14758 Raise an exception and return -1 on error. */
14759static int
14760unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14761 struct unicode_format_arg_t *arg)
14762{
14763#define FORMAT_READ(ctx) \
14764 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14765
14766 PyObject *v;
14767
Victor Stinnera47082312012-10-04 02:19:54 +020014768 if (arg->ch == '(') {
14769 /* Get argument value from a dictionary. Example: "%(name)s". */
14770 Py_ssize_t keystart;
14771 Py_ssize_t keylen;
14772 PyObject *key;
14773 int pcount = 1;
14774
14775 if (ctx->dict == NULL) {
14776 PyErr_SetString(PyExc_TypeError,
14777 "format requires a mapping");
14778 return -1;
14779 }
14780 ++ctx->fmtpos;
14781 --ctx->fmtcnt;
14782 keystart = ctx->fmtpos;
14783 /* Skip over balanced parentheses */
14784 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14785 arg->ch = FORMAT_READ(ctx);
14786 if (arg->ch == ')')
14787 --pcount;
14788 else if (arg->ch == '(')
14789 ++pcount;
14790 ctx->fmtpos++;
14791 }
14792 keylen = ctx->fmtpos - keystart - 1;
14793 if (ctx->fmtcnt < 0 || pcount > 0) {
14794 PyErr_SetString(PyExc_ValueError,
14795 "incomplete format key");
14796 return -1;
14797 }
14798 key = PyUnicode_Substring(ctx->fmtstr,
14799 keystart, keystart + keylen);
14800 if (key == NULL)
14801 return -1;
14802 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014803 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014804 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014805 }
14806 ctx->args = PyObject_GetItem(ctx->dict, key);
14807 Py_DECREF(key);
14808 if (ctx->args == NULL)
14809 return -1;
14810 ctx->args_owned = 1;
14811 ctx->arglen = -1;
14812 ctx->argidx = -2;
14813 }
14814
14815 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014816 while (--ctx->fmtcnt >= 0) {
14817 arg->ch = FORMAT_READ(ctx);
14818 ctx->fmtpos++;
14819 switch (arg->ch) {
14820 case '-': arg->flags |= F_LJUST; continue;
14821 case '+': arg->flags |= F_SIGN; continue;
14822 case ' ': arg->flags |= F_BLANK; continue;
14823 case '#': arg->flags |= F_ALT; continue;
14824 case '0': arg->flags |= F_ZERO; continue;
14825 }
14826 break;
14827 }
14828
14829 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014830 if (arg->ch == '*') {
14831 v = unicode_format_getnextarg(ctx);
14832 if (v == NULL)
14833 return -1;
14834 if (!PyLong_Check(v)) {
14835 PyErr_SetString(PyExc_TypeError,
14836 "* wants int");
14837 return -1;
14838 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014839 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014840 if (arg->width == -1 && PyErr_Occurred())
14841 return -1;
14842 if (arg->width < 0) {
14843 arg->flags |= F_LJUST;
14844 arg->width = -arg->width;
14845 }
14846 if (--ctx->fmtcnt >= 0) {
14847 arg->ch = FORMAT_READ(ctx);
14848 ctx->fmtpos++;
14849 }
14850 }
14851 else if (arg->ch >= '0' && arg->ch <= '9') {
14852 arg->width = arg->ch - '0';
14853 while (--ctx->fmtcnt >= 0) {
14854 arg->ch = FORMAT_READ(ctx);
14855 ctx->fmtpos++;
14856 if (arg->ch < '0' || arg->ch > '9')
14857 break;
14858 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14859 mixing signed and unsigned comparison. Since arg->ch is between
14860 '0' and '9', casting to int is safe. */
14861 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14862 PyErr_SetString(PyExc_ValueError,
14863 "width too big");
14864 return -1;
14865 }
14866 arg->width = arg->width*10 + (arg->ch - '0');
14867 }
14868 }
14869
14870 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014871 if (arg->ch == '.') {
14872 arg->prec = 0;
14873 if (--ctx->fmtcnt >= 0) {
14874 arg->ch = FORMAT_READ(ctx);
14875 ctx->fmtpos++;
14876 }
14877 if (arg->ch == '*') {
14878 v = unicode_format_getnextarg(ctx);
14879 if (v == NULL)
14880 return -1;
14881 if (!PyLong_Check(v)) {
14882 PyErr_SetString(PyExc_TypeError,
14883 "* wants int");
14884 return -1;
14885 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014886 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014887 if (arg->prec == -1 && PyErr_Occurred())
14888 return -1;
14889 if (arg->prec < 0)
14890 arg->prec = 0;
14891 if (--ctx->fmtcnt >= 0) {
14892 arg->ch = FORMAT_READ(ctx);
14893 ctx->fmtpos++;
14894 }
14895 }
14896 else if (arg->ch >= '0' && arg->ch <= '9') {
14897 arg->prec = arg->ch - '0';
14898 while (--ctx->fmtcnt >= 0) {
14899 arg->ch = FORMAT_READ(ctx);
14900 ctx->fmtpos++;
14901 if (arg->ch < '0' || arg->ch > '9')
14902 break;
14903 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14904 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014905 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014906 return -1;
14907 }
14908 arg->prec = arg->prec*10 + (arg->ch - '0');
14909 }
14910 }
14911 }
14912
14913 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14914 if (ctx->fmtcnt >= 0) {
14915 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14916 if (--ctx->fmtcnt >= 0) {
14917 arg->ch = FORMAT_READ(ctx);
14918 ctx->fmtpos++;
14919 }
14920 }
14921 }
14922 if (ctx->fmtcnt < 0) {
14923 PyErr_SetString(PyExc_ValueError,
14924 "incomplete format");
14925 return -1;
14926 }
14927 return 0;
14928
14929#undef FORMAT_READ
14930}
14931
14932/* Format one argument. Supported conversion specifiers:
14933
14934 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014935 - "i", "d", "u": int or float
14936 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014937 - "e", "E", "f", "F", "g", "G": float
14938 - "c": int or str (1 character)
14939
Victor Stinner8dbd4212012-12-04 09:30:24 +010014940 When possible, the output is written directly into the Unicode writer
14941 (ctx->writer). A string is created when padding is required.
14942
Victor Stinnera47082312012-10-04 02:19:54 +020014943 Return 0 if the argument has been formatted into *p_str,
14944 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014945 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014946static int
14947unicode_format_arg_format(struct unicode_formatter_t *ctx,
14948 struct unicode_format_arg_t *arg,
14949 PyObject **p_str)
14950{
14951 PyObject *v;
14952 _PyUnicodeWriter *writer = &ctx->writer;
14953
14954 if (ctx->fmtcnt == 0)
14955 ctx->writer.overallocate = 0;
14956
Victor Stinnera47082312012-10-04 02:19:54 +020014957 v = unicode_format_getnextarg(ctx);
14958 if (v == NULL)
14959 return -1;
14960
Victor Stinnera47082312012-10-04 02:19:54 +020014961
14962 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014963 case 's':
14964 case 'r':
14965 case 'a':
14966 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14967 /* Fast path */
14968 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14969 return -1;
14970 return 1;
14971 }
14972
14973 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14974 *p_str = v;
14975 Py_INCREF(*p_str);
14976 }
14977 else {
14978 if (arg->ch == 's')
14979 *p_str = PyObject_Str(v);
14980 else if (arg->ch == 'r')
14981 *p_str = PyObject_Repr(v);
14982 else
14983 *p_str = PyObject_ASCII(v);
14984 }
14985 break;
14986
14987 case 'i':
14988 case 'd':
14989 case 'u':
14990 case 'o':
14991 case 'x':
14992 case 'X':
14993 {
14994 int ret = mainformatlong(v, arg, p_str, writer);
14995 if (ret != 0)
14996 return ret;
14997 arg->sign = 1;
14998 break;
14999 }
15000
15001 case 'e':
15002 case 'E':
15003 case 'f':
15004 case 'F':
15005 case 'g':
15006 case 'G':
15007 if (arg->width == -1 && arg->prec == -1
15008 && !(arg->flags & (F_SIGN | F_BLANK)))
15009 {
15010 /* Fast path */
15011 if (formatfloat(v, arg, NULL, writer) == -1)
15012 return -1;
15013 return 1;
15014 }
15015
15016 arg->sign = 1;
15017 if (formatfloat(v, arg, p_str, NULL) == -1)
15018 return -1;
15019 break;
15020
15021 case 'c':
15022 {
15023 Py_UCS4 ch = formatchar(v);
15024 if (ch == (Py_UCS4) -1)
15025 return -1;
15026 if (arg->width == -1 && arg->prec == -1) {
15027 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015028 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015029 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015030 return 1;
15031 }
15032 *p_str = PyUnicode_FromOrdinal(ch);
15033 break;
15034 }
15035
15036 default:
15037 PyErr_Format(PyExc_ValueError,
15038 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015039 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015040 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15041 (int)arg->ch,
15042 ctx->fmtpos - 1);
15043 return -1;
15044 }
15045 if (*p_str == NULL)
15046 return -1;
15047 assert (PyUnicode_Check(*p_str));
15048 return 0;
15049}
15050
15051static int
15052unicode_format_arg_output(struct unicode_formatter_t *ctx,
15053 struct unicode_format_arg_t *arg,
15054 PyObject *str)
15055{
15056 Py_ssize_t len;
15057 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015058 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015059 Py_ssize_t pindex;
15060 Py_UCS4 signchar;
15061 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015062 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015063 Py_ssize_t sublen;
15064 _PyUnicodeWriter *writer = &ctx->writer;
15065 Py_UCS4 fill;
15066
15067 fill = ' ';
15068 if (arg->sign && arg->flags & F_ZERO)
15069 fill = '0';
15070
15071 if (PyUnicode_READY(str) == -1)
15072 return -1;
15073
15074 len = PyUnicode_GET_LENGTH(str);
15075 if ((arg->width == -1 || arg->width <= len)
15076 && (arg->prec == -1 || arg->prec >= len)
15077 && !(arg->flags & (F_SIGN | F_BLANK)))
15078 {
15079 /* Fast path */
15080 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15081 return -1;
15082 return 0;
15083 }
15084
15085 /* Truncate the string for "s", "r" and "a" formats
15086 if the precision is set */
15087 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15088 if (arg->prec >= 0 && len > arg->prec)
15089 len = arg->prec;
15090 }
15091
15092 /* Adjust sign and width */
15093 kind = PyUnicode_KIND(str);
15094 pbuf = PyUnicode_DATA(str);
15095 pindex = 0;
15096 signchar = '\0';
15097 if (arg->sign) {
15098 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15099 if (ch == '-' || ch == '+') {
15100 signchar = ch;
15101 len--;
15102 pindex++;
15103 }
15104 else if (arg->flags & F_SIGN)
15105 signchar = '+';
15106 else if (arg->flags & F_BLANK)
15107 signchar = ' ';
15108 else
15109 arg->sign = 0;
15110 }
15111 if (arg->width < len)
15112 arg->width = len;
15113
15114 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015115 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015116 if (!(arg->flags & F_LJUST)) {
15117 if (arg->sign) {
15118 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015119 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015120 }
15121 else {
15122 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015123 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015124 }
15125 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015126 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15127 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015128 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015129 }
15130
Victor Stinnera47082312012-10-04 02:19:54 +020015131 buflen = arg->width;
15132 if (arg->sign && len == arg->width)
15133 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015134 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015135 return -1;
15136
15137 /* Write the sign if needed */
15138 if (arg->sign) {
15139 if (fill != ' ') {
15140 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15141 writer->pos += 1;
15142 }
15143 if (arg->width > len)
15144 arg->width--;
15145 }
15146
15147 /* Write the numeric prefix for "x", "X" and "o" formats
15148 if the alternate form is used.
15149 For example, write "0x" for the "%#x" format. */
15150 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15151 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15152 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15153 if (fill != ' ') {
15154 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15155 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15156 writer->pos += 2;
15157 pindex += 2;
15158 }
15159 arg->width -= 2;
15160 if (arg->width < 0)
15161 arg->width = 0;
15162 len -= 2;
15163 }
15164
15165 /* Pad left with the fill character if needed */
15166 if (arg->width > len && !(arg->flags & F_LJUST)) {
15167 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015168 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015169 writer->pos += sublen;
15170 arg->width = len;
15171 }
15172
15173 /* If padding with spaces: write sign if needed and/or numeric prefix if
15174 the alternate form is used */
15175 if (fill == ' ') {
15176 if (arg->sign) {
15177 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15178 writer->pos += 1;
15179 }
15180 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15181 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15182 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15183 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15184 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15185 writer->pos += 2;
15186 pindex += 2;
15187 }
15188 }
15189
15190 /* Write characters */
15191 if (len) {
15192 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15193 str, pindex, len);
15194 writer->pos += len;
15195 }
15196
15197 /* Pad right with the fill character if needed */
15198 if (arg->width > len) {
15199 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015200 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015201 writer->pos += sublen;
15202 }
15203 return 0;
15204}
15205
15206/* Helper of PyUnicode_Format(): format one arg.
15207 Return 0 on success, raise an exception and return -1 on error. */
15208static int
15209unicode_format_arg(struct unicode_formatter_t *ctx)
15210{
15211 struct unicode_format_arg_t arg;
15212 PyObject *str;
15213 int ret;
15214
Victor Stinner8dbd4212012-12-04 09:30:24 +010015215 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015216 if (arg.ch == '%') {
15217 ctx->fmtpos++;
15218 ctx->fmtcnt--;
15219 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15220 return -1;
15221 return 0;
15222 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015223 arg.flags = 0;
15224 arg.width = -1;
15225 arg.prec = -1;
15226 arg.sign = 0;
15227 str = NULL;
15228
Victor Stinnera47082312012-10-04 02:19:54 +020015229 ret = unicode_format_arg_parse(ctx, &arg);
15230 if (ret == -1)
15231 return -1;
15232
15233 ret = unicode_format_arg_format(ctx, &arg, &str);
15234 if (ret == -1)
15235 return -1;
15236
15237 if (ret != 1) {
15238 ret = unicode_format_arg_output(ctx, &arg, str);
15239 Py_DECREF(str);
15240 if (ret == -1)
15241 return -1;
15242 }
15243
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015244 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015245 PyErr_SetString(PyExc_TypeError,
15246 "not all arguments converted during string formatting");
15247 return -1;
15248 }
15249 return 0;
15250}
15251
Alexander Belopolsky40018472011-02-26 01:02:56 +000015252PyObject *
15253PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015254{
Victor Stinnera47082312012-10-04 02:19:54 +020015255 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015256
Guido van Rossumd57fd912000-03-10 22:53:23 +000015257 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015258 PyErr_BadInternalCall();
15259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015260 }
Victor Stinnera47082312012-10-04 02:19:54 +020015261
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015262 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015263 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015264
15265 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015266 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15267 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15268 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15269 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015270
Victor Stinner8f674cc2013-04-17 23:02:17 +020015271 _PyUnicodeWriter_Init(&ctx.writer);
15272 ctx.writer.min_length = ctx.fmtcnt + 100;
15273 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015274
Guido van Rossumd57fd912000-03-10 22:53:23 +000015275 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015276 ctx.arglen = PyTuple_Size(args);
15277 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015278 }
15279 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015280 ctx.arglen = -1;
15281 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015282 }
Victor Stinnera47082312012-10-04 02:19:54 +020015283 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015284 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015285 ctx.dict = args;
15286 else
15287 ctx.dict = NULL;
15288 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015289
Victor Stinnera47082312012-10-04 02:19:54 +020015290 while (--ctx.fmtcnt >= 0) {
15291 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015292 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015293
15294 nonfmtpos = ctx.fmtpos++;
15295 while (ctx.fmtcnt >= 0 &&
15296 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15297 ctx.fmtpos++;
15298 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015299 }
Victor Stinnera47082312012-10-04 02:19:54 +020015300 if (ctx.fmtcnt < 0) {
15301 ctx.fmtpos--;
15302 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015303 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015304
Victor Stinnercfc4c132013-04-03 01:48:39 +020015305 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15306 nonfmtpos, ctx.fmtpos) < 0)
15307 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015308 }
15309 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015310 ctx.fmtpos++;
15311 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015312 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015313 }
15314 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015315
Victor Stinnera47082312012-10-04 02:19:54 +020015316 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015317 PyErr_SetString(PyExc_TypeError,
15318 "not all arguments converted during string formatting");
15319 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015320 }
15321
Victor Stinnera47082312012-10-04 02:19:54 +020015322 if (ctx.args_owned) {
15323 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015324 }
Victor Stinnera47082312012-10-04 02:19:54 +020015325 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015326
Benjamin Peterson29060642009-01-31 22:14:21 +000015327 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015328 _PyUnicodeWriter_Dealloc(&ctx.writer);
15329 if (ctx.args_owned) {
15330 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015331 }
15332 return NULL;
15333}
15334
Jeremy Hylton938ace62002-07-17 16:30:39 +000015335static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015336unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15337
Tim Peters6d6c1a32001-08-02 04:15:00 +000015338static PyObject *
15339unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15340{
Benjamin Peterson29060642009-01-31 22:14:21 +000015341 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015342 static char *kwlist[] = {"object", "encoding", "errors", 0};
15343 char *encoding = NULL;
15344 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015345
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 if (type != &PyUnicode_Type)
15347 return unicode_subtype_new(type, args, kwds);
15348 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015349 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015350 return NULL;
15351 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015352 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015353 if (encoding == NULL && errors == NULL)
15354 return PyObject_Str(x);
15355 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015356 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015357}
15358
Guido van Rossume023fe02001-08-30 03:12:59 +000015359static PyObject *
15360unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15361{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015362 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015363 Py_ssize_t length, char_size;
15364 int share_wstr, share_utf8;
15365 unsigned int kind;
15366 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015367
Benjamin Peterson14339b62009-01-31 16:36:08 +000015368 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015369
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015370 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015371 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015372 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015373 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015374 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015375 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015376 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015377 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015378
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015379 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015380 if (self == NULL) {
15381 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015382 return NULL;
15383 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015384 kind = PyUnicode_KIND(unicode);
15385 length = PyUnicode_GET_LENGTH(unicode);
15386
15387 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015388#ifdef Py_DEBUG
15389 _PyUnicode_HASH(self) = -1;
15390#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015391 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015392#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015393 _PyUnicode_STATE(self).interned = 0;
15394 _PyUnicode_STATE(self).kind = kind;
15395 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015396 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015397 _PyUnicode_STATE(self).ready = 1;
15398 _PyUnicode_WSTR(self) = NULL;
15399 _PyUnicode_UTF8_LENGTH(self) = 0;
15400 _PyUnicode_UTF8(self) = NULL;
15401 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015402 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015403
15404 share_utf8 = 0;
15405 share_wstr = 0;
15406 if (kind == PyUnicode_1BYTE_KIND) {
15407 char_size = 1;
15408 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15409 share_utf8 = 1;
15410 }
15411 else if (kind == PyUnicode_2BYTE_KIND) {
15412 char_size = 2;
15413 if (sizeof(wchar_t) == 2)
15414 share_wstr = 1;
15415 }
15416 else {
15417 assert(kind == PyUnicode_4BYTE_KIND);
15418 char_size = 4;
15419 if (sizeof(wchar_t) == 4)
15420 share_wstr = 1;
15421 }
15422
15423 /* Ensure we won't overflow the length. */
15424 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15425 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015426 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015427 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015428 data = PyObject_MALLOC((length + 1) * char_size);
15429 if (data == NULL) {
15430 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015431 goto onError;
15432 }
15433
Victor Stinnerc3c74152011-10-02 20:39:55 +020015434 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015435 if (share_utf8) {
15436 _PyUnicode_UTF8_LENGTH(self) = length;
15437 _PyUnicode_UTF8(self) = data;
15438 }
15439 if (share_wstr) {
15440 _PyUnicode_WSTR_LENGTH(self) = length;
15441 _PyUnicode_WSTR(self) = (wchar_t *)data;
15442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015443
Christian Heimesf051e432016-09-13 20:22:02 +020015444 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015445 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015446 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015447#ifdef Py_DEBUG
15448 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15449#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015450 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015451 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015452
15453onError:
15454 Py_DECREF(unicode);
15455 Py_DECREF(self);
15456 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015457}
15458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015459PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015460"str(object='') -> str\n\
15461str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015462\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015463Create a new string object from the given object. If encoding or\n\
15464errors is specified, then the object must expose a data buffer\n\
15465that will be decoded using the given encoding and error handler.\n\
15466Otherwise, returns the result of object.__str__() (if defined)\n\
15467or repr(object).\n\
15468encoding defaults to sys.getdefaultencoding().\n\
15469errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015470
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015471static PyObject *unicode_iter(PyObject *seq);
15472
Guido van Rossumd57fd912000-03-10 22:53:23 +000015473PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015474 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015475 "str", /* tp_name */
15476 sizeof(PyUnicodeObject), /* tp_basicsize */
15477 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015478 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015479 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015480 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015481 0, /* tp_getattr */
15482 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015483 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015484 unicode_repr, /* tp_repr */
15485 &unicode_as_number, /* tp_as_number */
15486 &unicode_as_sequence, /* tp_as_sequence */
15487 &unicode_as_mapping, /* tp_as_mapping */
15488 (hashfunc) unicode_hash, /* tp_hash*/
15489 0, /* tp_call*/
15490 (reprfunc) unicode_str, /* tp_str */
15491 PyObject_GenericGetAttr, /* tp_getattro */
15492 0, /* tp_setattro */
15493 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015494 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015495 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15496 unicode_doc, /* tp_doc */
15497 0, /* tp_traverse */
15498 0, /* tp_clear */
15499 PyUnicode_RichCompare, /* tp_richcompare */
15500 0, /* tp_weaklistoffset */
15501 unicode_iter, /* tp_iter */
15502 0, /* tp_iternext */
15503 unicode_methods, /* tp_methods */
15504 0, /* tp_members */
15505 0, /* tp_getset */
15506 &PyBaseObject_Type, /* tp_base */
15507 0, /* tp_dict */
15508 0, /* tp_descr_get */
15509 0, /* tp_descr_set */
15510 0, /* tp_dictoffset */
15511 0, /* tp_init */
15512 0, /* tp_alloc */
15513 unicode_new, /* tp_new */
15514 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015515};
15516
15517/* Initialize the Unicode implementation */
15518
Victor Stinner331a6a52019-05-27 16:39:22 +020015519PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015520_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015521{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015522 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015523 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015524 0x000A, /* LINE FEED */
15525 0x000D, /* CARRIAGE RETURN */
15526 0x001C, /* FILE SEPARATOR */
15527 0x001D, /* GROUP SEPARATOR */
15528 0x001E, /* RECORD SEPARATOR */
15529 0x0085, /* NEXT LINE */
15530 0x2028, /* LINE SEPARATOR */
15531 0x2029, /* PARAGRAPH SEPARATOR */
15532 };
15533
Fred Drakee4315f52000-05-09 19:53:39 +000015534 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015535 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015536 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015537 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015538 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015539 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015540
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015541 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015542 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015543 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015544
15545 /* initialize the linebreak bloom filter */
15546 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015547 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015548 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015549
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015550 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015551 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015552 }
15553 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015554 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015555 }
15556 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015557 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015558 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015559 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015560}
15561
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015562
Walter Dörwald16807132007-05-25 13:52:07 +000015563void
15564PyUnicode_InternInPlace(PyObject **p)
15565{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015566 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015567#ifdef Py_DEBUG
15568 assert(s != NULL);
15569 assert(_PyUnicode_CHECK(s));
15570#else
Victor Stinner607b1022020-05-05 18:50:30 +020015571 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015572 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015573 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015574#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015575
Benjamin Peterson14339b62009-01-31 16:36:08 +000015576 /* If it's a subclass, we don't really know what putting
15577 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015578 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015579 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015580 }
15581
15582 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015583 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015584 }
15585
15586#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015587 if (interned == NULL) {
15588 interned = PyDict_New();
15589 if (interned == NULL) {
15590 PyErr_Clear(); /* Don't leave an exception */
15591 return;
15592 }
15593 }
Victor Stinner607b1022020-05-05 18:50:30 +020015594
15595 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015596 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015597 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015598 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015599
Berker Peksagced8d4c2016-07-25 04:40:39 +030015600 if (t == NULL) {
15601 PyErr_Clear();
15602 return;
15603 }
Victor Stinner607b1022020-05-05 18:50:30 +020015604
Berker Peksagced8d4c2016-07-25 04:40:39 +030015605 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015606 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015607 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015608 return;
15609 }
Victor Stinner607b1022020-05-05 18:50:30 +020015610
Benjamin Peterson14339b62009-01-31 16:36:08 +000015611 /* The two references in interned are not counted by refcnt.
15612 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015613 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015614 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015615#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015616}
15617
15618void
15619PyUnicode_InternImmortal(PyObject **p)
15620{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015621 PyUnicode_InternInPlace(p);
15622 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015623 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015624 Py_INCREF(*p);
15625 }
Walter Dörwald16807132007-05-25 13:52:07 +000015626}
15627
15628PyObject *
15629PyUnicode_InternFromString(const char *cp)
15630{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015631 PyObject *s = PyUnicode_FromString(cp);
15632 if (s == NULL)
15633 return NULL;
15634 PyUnicode_InternInPlace(&s);
15635 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015636}
15637
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015638
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Release the interned-strings dict for the benefit of leak detectors.

   Every key of the interned dict gets back the references that interning
   removed from its refcount (2 for a mortal string, 1 for an immortal
   one) and is flagged SSTATE_NOT_INTERNED.  The dict is then cleared and
   released.  Nothing happens when the dict does not exist.

   When INTERNED_STATS is defined, the number of strings and their total
   lengths are reported on stderr. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        /* Do not leak the object if it is unexpectedly not a list. */
        Py_XDECREF(keys);
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n", n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* One reference to give back. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Two references to give back. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            /* Every key of the interned dict must be interned. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015698
15699
15700/********************* Unicode Iterator **************************/
15701
15702typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015703 PyObject_HEAD
15704 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015705 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015706} unicodeiterobject;
15707
15708static void
15709unicodeiter_dealloc(unicodeiterobject *it)
15710{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015711 _PyObject_GC_UNTRACK(it);
15712 Py_XDECREF(it->it_seq);
15713 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015714}
15715
15716static int
15717unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15718{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015719 Py_VISIT(it->it_seq);
15720 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015721}
15722
15723static PyObject *
15724unicodeiter_next(unicodeiterobject *it)
15725{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015726 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015727
Benjamin Peterson14339b62009-01-31 16:36:08 +000015728 assert(it != NULL);
15729 seq = it->it_seq;
15730 if (seq == NULL)
15731 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015732 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015734 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15735 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015736 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015737 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15738 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015739 if (item != NULL)
15740 ++it->it_index;
15741 return item;
15742 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015743
Benjamin Peterson14339b62009-01-31 16:36:08 +000015744 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015745 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015746 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015747}
15748
15749static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015750unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015751{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015752 Py_ssize_t len = 0;
15753 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015754 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015755 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015756}
15757
15758PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15759
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015760static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015761unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015762{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015763 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015764 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015765 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015766 it->it_seq, it->it_index);
15767 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015768 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015769 if (u == NULL)
15770 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015771 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015772 }
15773}
15774
15775PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15776
15777static PyObject *
15778unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15779{
15780 Py_ssize_t index = PyLong_AsSsize_t(state);
15781 if (index == -1 && PyErr_Occurred())
15782 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015783 if (it->it_seq != NULL) {
15784 if (index < 0)
15785 index = 0;
15786 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15787 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15788 it->it_index = index;
15789 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015790 Py_RETURN_NONE;
15791}
15792
15793PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15794
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015795static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015796 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015797 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015798 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15799 reduce_doc},
15800 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15801 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015802 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015803};
15804
15805PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015806 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15807 "str_iterator", /* tp_name */
15808 sizeof(unicodeiterobject), /* tp_basicsize */
15809 0, /* tp_itemsize */
15810 /* methods */
15811 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015812 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015813 0, /* tp_getattr */
15814 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015815 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015816 0, /* tp_repr */
15817 0, /* tp_as_number */
15818 0, /* tp_as_sequence */
15819 0, /* tp_as_mapping */
15820 0, /* tp_hash */
15821 0, /* tp_call */
15822 0, /* tp_str */
15823 PyObject_GenericGetAttr, /* tp_getattro */
15824 0, /* tp_setattro */
15825 0, /* tp_as_buffer */
15826 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15827 0, /* tp_doc */
15828 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15829 0, /* tp_clear */
15830 0, /* tp_richcompare */
15831 0, /* tp_weaklistoffset */
15832 PyObject_SelfIter, /* tp_iter */
15833 (iternextfunc)unicodeiter_next, /* tp_iternext */
15834 unicodeiter_methods, /* tp_methods */
15835 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015836};
15837
15838static PyObject *
15839unicode_iter(PyObject *seq)
15840{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015841 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015842
Benjamin Peterson14339b62009-01-31 16:36:08 +000015843 if (!PyUnicode_Check(seq)) {
15844 PyErr_BadInternalCall();
15845 return NULL;
15846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015847 if (PyUnicode_READY(seq) == -1)
15848 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015849 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15850 if (it == NULL)
15851 return NULL;
15852 it->it_index = 0;
15853 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015854 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015855 _PyObject_GC_TRACK(it);
15856 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015857}
15858
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015859
15860size_t
15861Py_UNICODE_strlen(const Py_UNICODE *u)
15862{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015863 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015864}
15865
15866Py_UNICODE*
15867Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15868{
15869 Py_UNICODE *u = s1;
15870 while ((*u++ = *s2++));
15871 return s1;
15872}
15873
15874Py_UNICODE*
15875Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15876{
15877 Py_UNICODE *u = s1;
15878 while ((*u++ = *s2++))
15879 if (n-- == 0)
15880 break;
15881 return s1;
15882}
15883
15884Py_UNICODE*
15885Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15886{
15887 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015888 u1 += wcslen(u1);
15889 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015890 return s1;
15891}
15892
15893int
15894Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15895{
15896 while (*s1 && *s2 && *s1 == *s2)
15897 s1++, s2++;
15898 if (*s1 && *s2)
15899 return (*s1 < *s2) ? -1 : +1;
15900 if (*s1)
15901 return 1;
15902 if (*s2)
15903 return -1;
15904 return 0;
15905}
15906
15907int
15908Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15909{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015910 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015911 for (; n != 0; n--) {
15912 u1 = *s1;
15913 u2 = *s2;
15914 if (u1 != u2)
15915 return (u1 < u2) ? -1 : +1;
15916 if (u1 == '\0')
15917 return 0;
15918 s1++;
15919 s2++;
15920 }
15921 return 0;
15922}
15923
15924Py_UNICODE*
15925Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15926{
15927 const Py_UNICODE *p;
15928 for (p = s; *p; p++)
15929 if (*p == c)
15930 return (Py_UNICODE*)p;
15931 return NULL;
15932}
15933
15934Py_UNICODE*
15935Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15936{
15937 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015938 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015939 while (p != s) {
15940 p--;
15941 if (*p == c)
15942 return (Py_UNICODE*)p;
15943 }
15944 return NULL;
15945}
Victor Stinner331ea922010-08-10 16:37:20 +000015946
Victor Stinner71133ff2010-09-01 23:43:53 +000015947Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015948PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015949{
Victor Stinner577db2c2011-10-11 22:12:48 +020015950 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015951 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015953 if (!PyUnicode_Check(unicode)) {
15954 PyErr_BadArgument();
15955 return NULL;
15956 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015957 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015958 if (u == NULL)
15959 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015960 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015961 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015962 PyErr_NoMemory();
15963 return NULL;
15964 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015965 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015966 size *= sizeof(Py_UNICODE);
15967 copy = PyMem_Malloc(size);
15968 if (copy == NULL) {
15969 PyErr_NoMemory();
15970 return NULL;
15971 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015972 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015973 return copy;
15974}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015975
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015976
Victor Stinner709d23d2019-05-02 14:56:30 -040015977static int
15978encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015979{
Victor Stinner709d23d2019-05-02 14:56:30 -040015980 int res;
15981 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15982 if (res == -2) {
15983 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15984 return -1;
15985 }
15986 if (res < 0) {
15987 PyErr_NoMemory();
15988 return -1;
15989 }
15990 return 0;
15991}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015992
Victor Stinner709d23d2019-05-02 14:56:30 -040015993
15994static int
15995config_get_codec_name(wchar_t **config_encoding)
15996{
15997 char *encoding;
15998 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15999 return -1;
16000 }
16001
16002 PyObject *name_obj = NULL;
16003 PyObject *codec = _PyCodec_Lookup(encoding);
16004 PyMem_RawFree(encoding);
16005
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016006 if (!codec)
16007 goto error;
16008
16009 name_obj = PyObject_GetAttrString(codec, "name");
16010 Py_CLEAR(codec);
16011 if (!name_obj) {
16012 goto error;
16013 }
16014
Victor Stinner709d23d2019-05-02 14:56:30 -040016015 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16016 Py_DECREF(name_obj);
16017 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016018 goto error;
16019 }
16020
Victor Stinner709d23d2019-05-02 14:56:30 -040016021 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16022 if (raw_wname == NULL) {
16023 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016024 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016025 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016026 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016027
16028 PyMem_RawFree(*config_encoding);
16029 *config_encoding = raw_wname;
16030
16031 PyMem_Free(wname);
16032 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016033
16034error:
16035 Py_XDECREF(codec);
16036 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016037 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016038}
16039
16040
Victor Stinner331a6a52019-05-27 16:39:22 +020016041static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016042init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016043{
Victor Stinner709d23d2019-05-02 14:56:30 -040016044 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016045 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016046 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016047 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016048 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016049 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016050 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016051}
16052
16053
Victor Stinner709d23d2019-05-02 14:56:30 -040016054static int
16055init_fs_codec(PyInterpreterState *interp)
16056{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016057 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016058
16059 _Py_error_handler error_handler;
16060 error_handler = get_error_handler_wide(config->filesystem_errors);
16061 if (error_handler == _Py_ERROR_UNKNOWN) {
16062 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16063 return -1;
16064 }
16065
16066 char *encoding, *errors;
16067 if (encode_wstr_utf8(config->filesystem_encoding,
16068 &encoding,
16069 "filesystem_encoding") < 0) {
16070 return -1;
16071 }
16072
16073 if (encode_wstr_utf8(config->filesystem_errors,
16074 &errors,
16075 "filesystem_errors") < 0) {
16076 PyMem_RawFree(encoding);
16077 return -1;
16078 }
16079
Victor Stinner3d17c042020-05-14 01:48:38 +020016080 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16081 PyMem_RawFree(fs_codec->encoding);
16082 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016083 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016084 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16085 PyMem_RawFree(fs_codec->errors);
16086 fs_codec->errors = errors;
16087 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016088
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016089#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016090 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016091#endif
16092
Victor Stinner709d23d2019-05-02 14:56:30 -040016093 /* At this point, PyUnicode_EncodeFSDefault() and
16094 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16095 the C implementation of the filesystem encoding. */
16096
16097 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16098 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016099 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16100 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016101 PyErr_NoMemory();
16102 return -1;
16103 }
16104 return 0;
16105}
16106
16107
Victor Stinner331a6a52019-05-27 16:39:22 +020016108static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016109init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016110{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016111 PyInterpreterState *interp = tstate->interp;
16112
Victor Stinner709d23d2019-05-02 14:56:30 -040016113 /* Update the filesystem encoding to the normalized Python codec name.
16114 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16115 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016116 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016117 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016118 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016119 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016120 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016121 }
16122
Victor Stinner709d23d2019-05-02 14:56:30 -040016123 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016124 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016125 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016126 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016127}
16128
16129
Victor Stinner331a6a52019-05-27 16:39:22 +020016130PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016131_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016132{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016133 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016134 if (_PyStatus_EXCEPTION(status)) {
16135 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016136 }
16137
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016138 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016139}
16140
16141
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016142static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016143_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016144{
Victor Stinner3d17c042020-05-14 01:48:38 +020016145 PyMem_RawFree(fs_codec->encoding);
16146 fs_codec->encoding = NULL;
16147 fs_codec->utf8 = 0;
16148 PyMem_RawFree(fs_codec->errors);
16149 fs_codec->errors = NULL;
16150 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016151}
16152
16153
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with a memory error set
   if the new configuration strings cannot be allocated (the
   configuration is then left unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both strings before touching the configuration. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16179
16180
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016181void
Victor Stinner3d483342019-11-22 12:27:50 +010016182_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016183{
Victor Stinner3d483342019-11-22 12:27:50 +010016184 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016185#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016186 /* Insure++ is a memory analysis tool that aids in discovering
16187 * memory leaks and other memory problems. On Python exit, the
16188 * interned string dictionaries are flagged as being in use at exit
16189 * (which it is). Under normal circumstances, this is fine because
16190 * the memory will be automatically reclaimed by the system. Under
16191 * memory debugging, it's a huge source of useless noise, so we
16192 * trade off slower shutdown for less distraction in the memory
16193 * reports. -baw
16194 */
16195 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016196#endif /* __INSURE__ */
16197
Victor Stinner3d483342019-11-22 12:27:50 +010016198 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016199
Victor Stinner607b1022020-05-05 18:50:30 +020016200#ifdef LATIN1_SINGLETONS
Victor Stinner3d483342019-11-22 12:27:50 +010016201 for (Py_ssize_t i = 0; i < 256; i++) {
16202 Py_CLEAR(unicode_latin1[i]);
16203 }
Victor Stinner607b1022020-05-05 18:50:30 +020016204#endif
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016205 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016206 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016207
Victor Stinner3d17c042020-05-14 01:48:38 +020016208 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016209}
16210
16211
Georg Brandl66c221e2010-10-14 07:04:07 +000016212/* A _string module, to export formatter_parser and formatter_field_name_split
16213 to the string.Formatter class implemented in Python. */
16214
16215static PyMethodDef _string_methods[] = {
16216 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16217 METH_O, PyDoc_STR("split the argument as a field name")},
16218 {"formatter_parser", (PyCFunction) formatter_parser,
16219 METH_O, PyDoc_STR("parse the argument as a format string")},
16220 {NULL, NULL}
16221};
16222
16223static struct PyModuleDef _string_module = {
16224 PyModuleDef_HEAD_INIT,
16225 "_string",
16226 PyDoc_STR("string helper module"),
16227 0,
16228 _string_methods,
16229 NULL,
16230 NULL,
16231 NULL,
16232 NULL
16233};
16234
16235PyMODINIT_FUNC
16236PyInit__string(void)
16237{
16238 return PyModule_Create(&_string_module);
16239}
16240
16241
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016242#ifdef __cplusplus
16243}
16244#endif