blob: 3c2383d57c863e0b79e81e4757be03feb8414f97 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020047#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinnerbcda8f12018-11-21 22:27:47 +010048#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020049#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040050#include "pycore_pylifecycle.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020051#include "pycore_pystate.h" // _PyInterpreterState_GET()
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000052#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070053#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
61/* #define INTERNED_STATS 1 */
62
63
Larry Hastings61272b72014-01-07 12:41:53 -080064/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090065class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080066[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090067/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
69/*[python input]
70class Py_UCS4_converter(CConverter):
71 type = 'Py_UCS4'
72 converter = 'convert_uc'
73
74 def converter_init(self):
75 if self.default is not unspecified:
76 self.c_default = ascii(self.default)
77 if len(self.c_default) > 4 or self.c_default[0] != "'":
78 self.c_default = hex(ord(self.default))
79
80[python start generated code]*/
81/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchaka05997252013-01-26 12:14:02 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner8faf8212011-12-08 22:14:11 +010096/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
97#define MAX_UNICODE 0x10ffff
98
/* Sanity check applied by the accessor macros below: a structural
   consistency check (without the O(n) content scan) in debug builds, a
   plain type check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif

/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer of a ready string.  For a compact ASCII string the
   result is the address immediately following the PyASCIIObject header;
   otherwise it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((char*)((PyASCIIObject*)(op) + 1))          \
         : _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length of a ready string.  For a compact ASCII string this
   is the length field of the PyASCIIObject header; otherwise it is the
   utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((PyASCIIObject*)(op))->length               \
         : _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to the wstr field. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
Inada Naoki610a60c2020-06-18 17:30:53 +0900123
/* Don't use the deprecated macro of unicodeobject.h: replace it with a local
   definition.

   Yields the length field for compact ASCII strings and the wstr_length
   field otherwise.  The argument is parenthesized before each cast so that
   any pointer expression (e.g. `cond ? a : b`) is cast as a whole; note
   that `op` is evaluated more than once. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked field accessors.  Each expands to a plain member access on the
   object cast to the struct that declares the field. */
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)

/* Same as above for state.kind and length, but asserting _PyUnicode_CHECK()
   first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)    (((PyUnicodeObject*)(op))->data.any)

/* Local replacement for PyUnicode_READY(): evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200153
/* True if the UTF-8 pointer is the same address as the character data
   returned by PyUnicode_DATA().  Must not be used on compact ASCII strings
   (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the same address as the character data
   returned by PyUnicode_DATA().  Every reference goes through the macro
   parameter `op`, so the macro does not depend on the caller having a
   variable with any particular name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
161
/* True if the Unicode object has an allocated UTF-8 memory block that is
   not shared with other data: the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from PyUnicode_DATA(). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block that is
   not shared with other data: the wstr pointer is non-NULL and either the
   string is not ready or wstr differs from PyUnicode_DATA(). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
175
/* Generic helper macro to convert characters between integer types.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (pointers to "from_type"); to points to the
   destination buffer of "to_type" elements.  Each source element is cast
   to to_type and stored, in order.

   The bulk of the input is copied four elements per iteration; the
   remaining 0..3 elements are copied one at a time.  Each of to, begin and
   end is evaluated exactly once, in that order. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (const from_type *)(begin);         \
        const from_type *_end = (const from_type *)(end);            \
        const Py_ssize_t _n = _end - _iter;                          \
        const from_type *_bulk_end =                                 \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                      \
        for (; _iter < _bulk_end; _iter += 4, _to += 4) {            \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < _end; ++_iter, ++_to) {                       \
            *_to = (to_type) *_iter;                                 \
        }                                                            \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200199
/* Overallocation factor: 2 stands for overallocating by 50%, 4 for 25%. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
207
Victor Stinner9512ad72020-05-20 00:27:46 +0200208#define INTERNED_STRINGS
Victor Stinner607b1022020-05-05 18:50:30 +0200209
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Victor Stinner607b1022020-05-05 18:50:30 +0200218#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200219static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200220#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000221
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200223static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200224
/* Take one additional reference to the shared empty string.

   If the singleton does not exist yet it is created with PyUnicode_New(0, 0)
   and stored in unicode_empty; the additional reference is then taken on top
   of the one held by unicode_empty itself.  If creation fails, unicode_empty
   stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return the shared empty string from the enclosing function, after taking
   a reference via _Py_INCREF_UNICODE_EMPTY().  The returned value is NULL if
   the singleton could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243
Victor Stinner59423e32018-11-26 13:40:01 +0100244static inline void
245unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
246 Py_ssize_t start, Py_ssize_t length)
247{
248 assert(0 <= start);
249 assert(kind != PyUnicode_WCHAR_KIND);
250 switch (kind) {
251 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100252 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100253 Py_UCS1 ch = (unsigned char)value;
254 Py_UCS1 *to = (Py_UCS1 *)data + start;
255 memset(to, ch, length);
256 break;
257 }
258 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100259 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100260 Py_UCS2 ch = (Py_UCS2)value;
261 Py_UCS2 *to = (Py_UCS2 *)data + start;
262 const Py_UCS2 *end = to + length;
263 for (; to < end; ++to) *to = ch;
264 break;
265 }
266 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100267 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100268 Py_UCS4 ch = value;
269 Py_UCS4 * to = (Py_UCS4 *)data + start;
270 const Py_UCS4 *end = to + length;
271 for (; to < end; ++to) *to = ch;
272 break;
273 }
274 default: Py_UNREACHABLE();
275 }
276}
277
278
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200279/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700280static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200281_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900282static inline void
283_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400284static PyObject *
285unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
286 const char *errors);
287static PyObject *
288unicode_decode_utf8(const char *s, Py_ssize_t size,
289 _Py_error_handler error_handler, const char *errors,
290 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200291
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200292/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200293static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200294
Victor Stinner9512ad72020-05-20 00:27:46 +0200295#define LATIN1_SINGLETONS
Victor Stinner607b1022020-05-05 18:50:30 +0200296
297#ifdef LATIN1_SINGLETONS
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000298/* Single character Unicode strings in the Latin-1 range are being
299 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200300static PyObject *unicode_latin1[256] = {NULL};
Victor Stinner607b1022020-05-05 18:50:30 +0200301#endif
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000302
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point in the range 0..127: an entry is 1 if
   the character is whitespace and 0 otherwise.  Entries that are not listed
   are zero-initialized. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
333
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200334/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200335static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200336static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100337static int unicode_modifiable(PyObject *unicode);
338
Victor Stinnerfe226c02011-10-03 03:52:20 +0200339
Alexander Belopolsky40018472011-02-26 01:02:56 +0000340static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100341_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200342static PyObject *
343_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
344static PyObject *
345_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
346
347static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000348unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000349 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100350 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000351 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
352
Alexander Belopolsky40018472011-02-26 01:02:56 +0000353static void
354raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300355 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100356 PyObject *unicode,
357 Py_ssize_t startpos, Py_ssize_t endpos,
358 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000359
/* Same for linebreaks: lookup table indexed by code point in the range
   0..127, with 1 for line-breaking characters and 0 otherwise.  Entries
   that are not listed are zero-initialized. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
387
INADA Naoki3ae20562017-01-16 20:41:20 +0900388static int convert_uc(PyObject *obj, void *addr);
389
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300390#include "clinic/unicodeobject.c.h"
391
Victor Stinner3d4226a2018-08-29 22:21:32 +0200392_Py_error_handler
393_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200394{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200397 }
398 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200400 }
401 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200402 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200403 }
404 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200405 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200406 }
407 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200408 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200409 }
410 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200411 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200412 }
413 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200414 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200415 }
Victor Stinner50149202015-09-22 00:26:54 +0200416 return _Py_ERROR_OTHER;
417}
418
Victor Stinner709d23d2019-05-02 14:56:30 -0400419
420static _Py_error_handler
421get_error_handler_wide(const wchar_t *errors)
422{
423 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
424 return _Py_ERROR_STRICT;
425 }
426 if (wcscmp(errors, L"surrogateescape") == 0) {
427 return _Py_ERROR_SURROGATEESCAPE;
428 }
429 if (wcscmp(errors, L"replace") == 0) {
430 return _Py_ERROR_REPLACE;
431 }
432 if (wcscmp(errors, L"ignore") == 0) {
433 return _Py_ERROR_IGNORE;
434 }
435 if (wcscmp(errors, L"backslashreplace") == 0) {
436 return _Py_ERROR_BACKSLASHREPLACE;
437 }
438 if (wcscmp(errors, L"surrogatepass") == 0) {
439 return _Py_ERROR_SURROGATEPASS;
440 }
441 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
442 return _Py_ERROR_XMLCHARREFREPLACE;
443 }
444 return _Py_ERROR_OTHER;
445}
446
447
Victor Stinner22eb6892019-06-26 00:51:05 +0200448static inline int
449unicode_check_encoding_errors(const char *encoding, const char *errors)
450{
451 if (encoding == NULL && errors == NULL) {
452 return 0;
453 }
454
Victor Stinner81a7be32020-04-14 15:14:01 +0200455 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200456#ifndef Py_DEBUG
457 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200458 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200459 return 0;
460 }
461#else
462 /* Always check in debug mode */
463#endif
464
465 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
466 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200467 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200468 return 0;
469 }
470
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200471 /* Disable checks during Python finalization. For example, it allows to
472 call _PyObject_Dump() during finalization for debugging purpose. */
473 if (interp->finalizing) {
474 return 0;
475 }
476
Victor Stinner22eb6892019-06-26 00:51:05 +0200477 if (encoding != NULL) {
478 PyObject *handler = _PyCodec_Lookup(encoding);
479 if (handler == NULL) {
480 return -1;
481 }
482 Py_DECREF(handler);
483 }
484
485 if (errors != NULL) {
486 PyObject *handler = PyCodec_LookupError(errors);
487 if (handler == NULL) {
488 return -1;
489 }
490 Py_DECREF(handler);
491 }
492 return 0;
493}
494
495
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300496/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
497 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000498Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000499PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000500{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000501#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000502 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000503#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000504 /* This is actually an illegal character, so it should
505 not be passed to unichr. */
506 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000507#endif
508}
509
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200510int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100511_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200512{
Victor Stinner68762572019-10-07 18:42:01 +0200513#define CHECK(expr) \
514 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
515
Victor Stinner910337b2011-10-03 03:20:16 +0200516 PyASCIIObject *ascii;
517 unsigned int kind;
518
Victor Stinner68762572019-10-07 18:42:01 +0200519 assert(op != NULL);
520 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200521
522 ascii = (PyASCIIObject *)op;
523 kind = ascii->state.kind;
524
Victor Stinnera3b334d2011-10-03 13:53:37 +0200525 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200526 CHECK(kind == PyUnicode_1BYTE_KIND);
527 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200528 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200529 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200530 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200531 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200532
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 if (ascii->state.compact == 1) {
534 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200535 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200536 || kind == PyUnicode_2BYTE_KIND
537 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(ascii->state.ascii == 0);
539 CHECK(ascii->state.ready == 1);
540 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100541 }
542 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200543 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
544
545 data = unicode->data.any;
546 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200547 CHECK(ascii->length == 0);
548 CHECK(ascii->hash == -1);
549 CHECK(ascii->state.compact == 0);
550 CHECK(ascii->state.ascii == 0);
551 CHECK(ascii->state.ready == 0);
552 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
553 CHECK(ascii->wstr != NULL);
554 CHECK(data == NULL);
555 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200556 }
557 else {
Victor Stinner68762572019-10-07 18:42:01 +0200558 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200559 || kind == PyUnicode_2BYTE_KIND
560 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(ascii->state.compact == 0);
562 CHECK(ascii->state.ready == 1);
563 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200564 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(compact->utf8 == data);
566 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200567 }
568 else
Victor Stinner68762572019-10-07 18:42:01 +0200569 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 }
571 }
572 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200573 if (
574#if SIZEOF_WCHAR_T == 2
575 kind == PyUnicode_2BYTE_KIND
576#else
577 kind == PyUnicode_4BYTE_KIND
578#endif
579 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200580 {
Victor Stinner68762572019-10-07 18:42:01 +0200581 CHECK(ascii->wstr == data);
582 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200583 } else
Victor Stinner68762572019-10-07 18:42:01 +0200584 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200585 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200586
587 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200588 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200589 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200590 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200591 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200592
593 /* check that the best kind is used: O(n) operation */
594 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200595 Py_ssize_t i;
596 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300597 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200598 Py_UCS4 ch;
599
600 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200601 for (i=0; i < ascii->length; i++)
602 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200603 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200604 if (ch > maxchar)
605 maxchar = ch;
606 }
607 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100608 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200609 CHECK(maxchar >= 128);
610 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100611 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200612 else
Victor Stinner68762572019-10-07 18:42:01 +0200613 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200614 }
Victor Stinner77faf692011-11-20 18:56:05 +0100615 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200616 CHECK(maxchar >= 0x100);
617 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100618 }
619 else {
Victor Stinner68762572019-10-07 18:42:01 +0200620 CHECK(maxchar >= 0x10000);
621 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100622 }
Victor Stinner68762572019-10-07 18:42:01 +0200623 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200624 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400625 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200626
627#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400628}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200629
Victor Stinner910337b2011-10-03 03:20:16 +0200630
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631static PyObject*
632unicode_result_wchar(PyObject *unicode)
633{
634#ifndef Py_DEBUG
635 Py_ssize_t len;
636
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100637 len = _PyUnicode_WSTR_LENGTH(unicode);
638 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100639 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200640 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100641 }
642
643 if (len == 1) {
644 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100645 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100646 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
647 Py_DECREF(unicode);
648 return latin1_char;
649 }
650 }
651
652 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200653 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100654 return NULL;
655 }
656#else
Victor Stinneraa771272012-10-04 02:32:58 +0200657 assert(Py_REFCNT(unicode) == 1);
658
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100659 /* don't make the result ready in debug mode to ensure that the caller
660 makes the string ready before using it */
661 assert(_PyUnicode_CheckConsistency(unicode, 1));
662#endif
663 return unicode;
664}
665
666static PyObject*
667unicode_result_ready(PyObject *unicode)
668{
669 Py_ssize_t length;
670
671 length = PyUnicode_GET_LENGTH(unicode);
672 if (length == 0) {
673 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100674 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200675 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100676 }
677 return unicode_empty;
678 }
679
Victor Stinner607b1022020-05-05 18:50:30 +0200680#ifdef LATIN1_SINGLETONS
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100681 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300682 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200683 int kind = PyUnicode_KIND(unicode);
684 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100685 if (ch < 256) {
686 PyObject *latin1_char = unicode_latin1[ch];
687 if (latin1_char != NULL) {
688 if (unicode != latin1_char) {
689 Py_INCREF(latin1_char);
690 Py_DECREF(unicode);
691 }
692 return latin1_char;
693 }
694 else {
695 assert(_PyUnicode_CheckConsistency(unicode, 1));
696 Py_INCREF(unicode);
697 unicode_latin1[ch] = unicode;
698 return unicode;
699 }
700 }
701 }
Victor Stinner607b1022020-05-05 18:50:30 +0200702#endif
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100703
704 assert(_PyUnicode_CheckConsistency(unicode, 1));
705 return unicode;
706}
707
708static PyObject*
709unicode_result(PyObject *unicode)
710{
711 assert(_PyUnicode_CHECK(unicode));
712 if (PyUnicode_IS_READY(unicode))
713 return unicode_result_ready(unicode);
714 else
715 return unicode_result_wchar(unicode);
716}
717
Victor Stinnerc4b49542011-12-11 22:44:26 +0100718static PyObject*
719unicode_result_unchanged(PyObject *unicode)
720{
721 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500722 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100723 return NULL;
724 Py_INCREF(unicode);
725 return unicode;
726 }
727 else
728 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100729 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100730}
731
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200732/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
733 ASCII, Latin1, UTF-8, etc. */
734static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200735backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200736 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
737{
Victor Stinnerad771582015-10-09 12:38:53 +0200738 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200739 Py_UCS4 ch;
740 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300741 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200742
743 assert(PyUnicode_IS_READY(unicode));
744 kind = PyUnicode_KIND(unicode);
745 data = PyUnicode_DATA(unicode);
746
747 size = 0;
748 /* determine replacement size */
749 for (i = collstart; i < collend; ++i) {
750 Py_ssize_t incr;
751
752 ch = PyUnicode_READ(kind, data, i);
753 if (ch < 0x100)
754 incr = 2+2;
755 else if (ch < 0x10000)
756 incr = 2+4;
757 else {
758 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200759 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200760 }
761 if (size > PY_SSIZE_T_MAX - incr) {
762 PyErr_SetString(PyExc_OverflowError,
763 "encoded result is too long for a Python string");
764 return NULL;
765 }
766 size += incr;
767 }
768
Victor Stinnerad771582015-10-09 12:38:53 +0200769 str = _PyBytesWriter_Prepare(writer, str, size);
770 if (str == NULL)
771 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200772
773 /* generate replacement */
774 for (i = collstart; i < collend; ++i) {
775 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200776 *str++ = '\\';
777 if (ch >= 0x00010000) {
778 *str++ = 'U';
779 *str++ = Py_hexdigits[(ch>>28)&0xf];
780 *str++ = Py_hexdigits[(ch>>24)&0xf];
781 *str++ = Py_hexdigits[(ch>>20)&0xf];
782 *str++ = Py_hexdigits[(ch>>16)&0xf];
783 *str++ = Py_hexdigits[(ch>>12)&0xf];
784 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200785 }
Victor Stinner797485e2015-10-09 03:17:30 +0200786 else if (ch >= 0x100) {
787 *str++ = 'u';
788 *str++ = Py_hexdigits[(ch>>12)&0xf];
789 *str++ = Py_hexdigits[(ch>>8)&0xf];
790 }
791 else
792 *str++ = 'x';
793 *str++ = Py_hexdigits[(ch>>4)&0xf];
794 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200795 }
796 return str;
797}
798
799/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
800 ASCII, Latin1, UTF-8, etc. */
801static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200802xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200803 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
804{
Victor Stinnerad771582015-10-09 12:38:53 +0200805 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200806 Py_UCS4 ch;
807 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300808 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200809
810 assert(PyUnicode_IS_READY(unicode));
811 kind = PyUnicode_KIND(unicode);
812 data = PyUnicode_DATA(unicode);
813
814 size = 0;
815 /* determine replacement size */
816 for (i = collstart; i < collend; ++i) {
817 Py_ssize_t incr;
818
819 ch = PyUnicode_READ(kind, data, i);
820 if (ch < 10)
821 incr = 2+1+1;
822 else if (ch < 100)
823 incr = 2+2+1;
824 else if (ch < 1000)
825 incr = 2+3+1;
826 else if (ch < 10000)
827 incr = 2+4+1;
828 else if (ch < 100000)
829 incr = 2+5+1;
830 else if (ch < 1000000)
831 incr = 2+6+1;
832 else {
833 assert(ch <= MAX_UNICODE);
834 incr = 2+7+1;
835 }
836 if (size > PY_SSIZE_T_MAX - incr) {
837 PyErr_SetString(PyExc_OverflowError,
838 "encoded result is too long for a Python string");
839 return NULL;
840 }
841 size += incr;
842 }
843
Victor Stinnerad771582015-10-09 12:38:53 +0200844 str = _PyBytesWriter_Prepare(writer, str, size);
845 if (str == NULL)
846 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200847
848 /* generate replacement */
849 for (i = collstart; i < collend; ++i) {
850 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
851 }
852 return str;
853}
854
Thomas Wouters477c8d52006-05-27 19:21:47 +0000855/* --- Bloom Filters ----------------------------------------------------- */
856
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask: the low-order bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) select the bit index. */
860
861/* the linebreak mask is set up by Unicode_Init below */
862
Antoine Pitrouf068f942010-01-13 14:19:12 +0000863#if LONG_BIT >= 128
864#define BLOOM_WIDTH 128
865#elif LONG_BIT >= 64
866#define BLOOM_WIDTH 64
867#elif LONG_BIT >= 32
868#define BLOOM_WIDTH 32
869#else
870#error "LONG_BIT is smaller than 32"
871#endif
872
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873#define BLOOM_MASK unsigned long
874
Serhiy Storchaka05997252013-01-26 12:14:02 +0200875static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000876
Antoine Pitrouf068f942010-01-13 14:19:12 +0000877#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000878
Benjamin Peterson29060642009-01-31 22:14:21 +0000879#define BLOOM_LINEBREAK(ch) \
880 ((ch) < 128U ? ascii_linebreak[(ch)] : \
881 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700883static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300884make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000885{
Victor Stinnera85af502013-04-09 21:53:54 +0200886#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
887 do { \
888 TYPE *data = (TYPE *)PTR; \
889 TYPE *end = data + LEN; \
890 Py_UCS4 ch; \
891 for (; data != end; data++) { \
892 ch = *data; \
893 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
894 } \
895 break; \
896 } while (0)
897
Thomas Wouters477c8d52006-05-27 19:21:47 +0000898 /* calculate simple bloom-style bitmask for a given unicode string */
899
Antoine Pitrouf068f942010-01-13 14:19:12 +0000900 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000901
902 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200903 switch (kind) {
904 case PyUnicode_1BYTE_KIND:
905 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
906 break;
907 case PyUnicode_2BYTE_KIND:
908 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
909 break;
910 case PyUnicode_4BYTE_KIND:
911 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
912 break;
913 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700914 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200915 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000916 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200917
918#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000919}
920
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300921static int
922ensure_unicode(PyObject *obj)
923{
924 if (!PyUnicode_Check(obj)) {
925 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200926 "must be str, not %.100s",
927 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300928 return -1;
929 }
930 return PyUnicode_READY(obj);
931}
932
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200933/* Compilation of templated routines */
934
935#include "stringlib/asciilib.h"
936#include "stringlib/fastsearch.h"
937#include "stringlib/partition.h"
938#include "stringlib/split.h"
939#include "stringlib/count.h"
940#include "stringlib/find.h"
941#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200942#include "stringlib/undef.h"
943
944#include "stringlib/ucs1lib.h"
945#include "stringlib/fastsearch.h"
946#include "stringlib/partition.h"
947#include "stringlib/split.h"
948#include "stringlib/count.h"
949#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300950#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200951#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200952#include "stringlib/undef.h"
953
954#include "stringlib/ucs2lib.h"
955#include "stringlib/fastsearch.h"
956#include "stringlib/partition.h"
957#include "stringlib/split.h"
958#include "stringlib/count.h"
959#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300960#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200961#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200962#include "stringlib/undef.h"
963
964#include "stringlib/ucs4lib.h"
965#include "stringlib/fastsearch.h"
966#include "stringlib/partition.h"
967#include "stringlib/split.h"
968#include "stringlib/count.h"
969#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300970#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200971#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200972#include "stringlib/undef.h"
973
Inada Naoki610a60c2020-06-18 17:30:53 +0900974_Py_COMP_DIAG_PUSH
975_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200976#include "stringlib/unicodedefs.h"
977#include "stringlib/fastsearch.h"
978#include "stringlib/count.h"
979#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100980#include "stringlib/undef.h"
Inada Naoki610a60c2020-06-18 17:30:53 +0900981_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200982
Guido van Rossumd57fd912000-03-10 22:53:23 +0000983/* --- Unicode Object ----------------------------------------------------- */
984
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700985static inline Py_ssize_t
986findchar(const void *s, int kind,
987 Py_ssize_t size, Py_UCS4 ch,
988 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200990 switch (kind) {
991 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 if ((Py_UCS1) ch != ch)
993 return -1;
994 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600995 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200996 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600997 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200998 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 if ((Py_UCS2) ch != ch)
1000 return -1;
1001 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001002 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001003 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001004 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001005 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001006 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001007 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001008 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001009 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001010 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001011 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001013}
1014
#ifdef Py_DEBUG
/* Fill the newly added part of a string's data with invalid characters, to
   detect bugs earlier.

   Every byte of the characters in [old_length, current length) is set to
   0xff.  Nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t added = _PyUnicode_LENGTH(unicode) - old_length;

    if (added > 0) {
        memset(bytes + old_length * char_size, 0xff, added * char_size);
    }
}
#endif
1033
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034static PyObject*
1035resize_compact(PyObject *unicode, Py_ssize_t length)
1036{
1037 Py_ssize_t char_size;
1038 Py_ssize_t struct_size;
1039 Py_ssize_t new_size;
1040 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001041 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001042#ifdef Py_DEBUG
1043 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1044#endif
1045
Victor Stinner79891572012-05-03 13:43:07 +02001046 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001048 assert(PyUnicode_IS_COMPACT(unicode));
1049
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001050 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001051 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052 struct_size = sizeof(PyASCIIObject);
1053 else
1054 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001055 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056
Victor Stinnerfe226c02011-10-03 03:52:20 +02001057 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1058 PyErr_NoMemory();
1059 return NULL;
1060 }
1061 new_size = (struct_size + (length + 1) * char_size);
1062
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001063 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1064 PyObject_DEL(_PyUnicode_UTF8(unicode));
1065 _PyUnicode_UTF8(unicode) = NULL;
1066 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1067 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001068#ifdef Py_REF_DEBUG
1069 _Py_RefTotal--;
1070#endif
1071#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001072 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001073#endif
Victor Stinner84def372011-12-11 20:04:56 +01001074
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001075 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001076 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001077 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 PyErr_NoMemory();
1079 return NULL;
1080 }
Victor Stinner84def372011-12-11 20:04:56 +01001081 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001083
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001085 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001086 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001087 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001088 _PyUnicode_WSTR_LENGTH(unicode) = length;
1089 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001090 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1091 PyObject_DEL(_PyUnicode_WSTR(unicode));
1092 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001093 if (!PyUnicode_IS_ASCII(unicode))
1094 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001095 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001096#ifdef Py_DEBUG
1097 unicode_fill_invalid(unicode, old_length);
1098#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001099 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1100 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001101 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001102 return unicode;
1103}
1104
Alexander Belopolsky40018472011-02-26 01:02:56 +00001105static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001106resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107{
Victor Stinner95663112011-10-04 01:03:50 +02001108 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001109 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001111 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001112
Victor Stinnerfe226c02011-10-03 03:52:20 +02001113 if (PyUnicode_IS_READY(unicode)) {
1114 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001115 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001117#ifdef Py_DEBUG
1118 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1119#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001120
1121 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001122 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001123 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1124 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001125
1126 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1127 PyErr_NoMemory();
1128 return -1;
1129 }
1130 new_size = (length + 1) * char_size;
1131
Victor Stinner7a9105a2011-12-12 00:13:42 +01001132 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1133 {
1134 PyObject_DEL(_PyUnicode_UTF8(unicode));
1135 _PyUnicode_UTF8(unicode) = NULL;
1136 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1137 }
1138
Victor Stinnerfe226c02011-10-03 03:52:20 +02001139 data = (PyObject *)PyObject_REALLOC(data, new_size);
1140 if (data == NULL) {
1141 PyErr_NoMemory();
1142 return -1;
1143 }
1144 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001145 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001146 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_WSTR_LENGTH(unicode) = length;
1148 }
1149 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001150 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001151 _PyUnicode_UTF8_LENGTH(unicode) = length;
1152 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001153 _PyUnicode_LENGTH(unicode) = length;
1154 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001155#ifdef Py_DEBUG
1156 unicode_fill_invalid(unicode, old_length);
1157#endif
Victor Stinner95663112011-10-04 01:03:50 +02001158 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001159 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001160 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001161 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001162 }
Victor Stinner95663112011-10-04 01:03:50 +02001163 assert(_PyUnicode_WSTR(unicode) != NULL);
1164
1165 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001166 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001167 PyErr_NoMemory();
1168 return -1;
1169 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001170 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001171 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001172 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001173 if (!wstr) {
1174 PyErr_NoMemory();
1175 return -1;
1176 }
1177 _PyUnicode_WSTR(unicode) = wstr;
1178 _PyUnicode_WSTR(unicode)[length] = 0;
1179 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001180 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 return 0;
1182}
1183
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184static PyObject*
1185resize_copy(PyObject *unicode, Py_ssize_t length)
1186{
1187 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001188 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001189 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001190
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001191 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001192
1193 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1194 if (copy == NULL)
1195 return NULL;
1196
1197 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001198 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001199 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001200 }
1201 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001202 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001203
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001204 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001205 if (w == NULL)
1206 return NULL;
1207 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1208 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001209 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001210 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001211 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001212 }
1213}
1214
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001216 Ux0000 terminated; some code (e.g. new_identifier)
1217 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218
1219 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001220 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221
1222*/
1223
Alexander Belopolsky40018472011-02-26 01:02:56 +00001224static PyUnicodeObject *
1225_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001227 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229
Thomas Wouters477c8d52006-05-27 19:21:47 +00001230 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 if (length == 0 && unicode_empty != NULL) {
1232 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001233 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 }
1235
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001236 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001237 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001238 return (PyUnicodeObject *)PyErr_NoMemory();
1239 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001240 if (length < 0) {
1241 PyErr_SetString(PyExc_SystemError,
1242 "Negative size passed to _PyUnicode_New");
1243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244 }
1245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1247 if (unicode == NULL)
1248 return NULL;
1249 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001250
1251 _PyUnicode_WSTR_LENGTH(unicode) = length;
1252 _PyUnicode_HASH(unicode) = -1;
1253 _PyUnicode_STATE(unicode).interned = 0;
1254 _PyUnicode_STATE(unicode).kind = 0;
1255 _PyUnicode_STATE(unicode).compact = 0;
1256 _PyUnicode_STATE(unicode).ready = 0;
1257 _PyUnicode_STATE(unicode).ascii = 0;
1258 _PyUnicode_DATA_ANY(unicode) = NULL;
1259 _PyUnicode_LENGTH(unicode) = 0;
1260 _PyUnicode_UTF8(unicode) = NULL;
1261 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1264 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001265 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001266 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001267 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001268 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269
Jeremy Hyltond8082792003-09-16 19:41:39 +00001270 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001271 * the caller fails before initializing str -- unicode_resize()
1272 * reads str[0], and the Keep-Alive optimization can keep memory
1273 * allocated for str alive across a call to unicode_dealloc(unicode).
1274 * We don't want unicode_resize to read uninitialized memory in
1275 * that case.
1276 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001277 _PyUnicode_WSTR(unicode)[0] = 0;
1278 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001279
Victor Stinner7931d9a2011-11-04 00:22:48 +01001280 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 return unicode;
1282}
1283
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284static const char*
1285unicode_kind_name(PyObject *unicode)
1286{
Victor Stinner42dfd712011-10-03 14:41:45 +02001287 /* don't check consistency: unicode_kind_name() is called from
1288 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 if (!PyUnicode_IS_COMPACT(unicode))
1290 {
1291 if (!PyUnicode_IS_READY(unicode))
1292 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001293 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001294 {
1295 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001296 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001297 return "legacy ascii";
1298 else
1299 return "legacy latin1";
1300 case PyUnicode_2BYTE_KIND:
1301 return "legacy UCS2";
1302 case PyUnicode_4BYTE_KIND:
1303 return "legacy UCS4";
1304 default:
1305 return "<legacy invalid kind>";
1306 }
1307 }
1308 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001309 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001310 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001311 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001312 return "ascii";
1313 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001314 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001315 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001316 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001317 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001318 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001319 default:
1320 return "<invalid compact kind>";
1321 }
1322}
1323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001326const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001327 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001328 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329}
1330
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro.
   `unicode_raw` is cast to PyObject* and handed to the macro unchanged. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001335const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001336 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001337 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1339 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1340 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1341 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1342 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1343 return PyUnicode_DATA(unicode);
1344}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001345
1346void
1347_PyUnicode_Dump(PyObject *op)
1348{
1349 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001350 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1351 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001352 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001353
Victor Stinnera849a4b2011-10-03 12:12:11 +02001354 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001355 {
1356 if (ascii->state.ascii)
1357 data = (ascii + 1);
1358 else
1359 data = (compact + 1);
1360 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001361 else
1362 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001363 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1364 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001365
Victor Stinnera849a4b2011-10-03 12:12:11 +02001366 if (ascii->wstr == data)
1367 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001368 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001369
Victor Stinnera3b334d2011-10-03 13:53:37 +02001370 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001371 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001372 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1373 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001374 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001375 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001376 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001377 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001378}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001379#endif
1380
1381PyObject *
1382PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1383{
1384 PyObject *obj;
1385 PyCompactUnicodeObject *unicode;
1386 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001387 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001388 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 Py_ssize_t char_size;
1390 Py_ssize_t struct_size;
1391
1392 /* Optimization for empty strings */
1393 if (size == 0 && unicode_empty != NULL) {
1394 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001395 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 }
1397
Victor Stinner9e9d6892011-10-04 01:02:02 +02001398 is_ascii = 0;
1399 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 struct_size = sizeof(PyCompactUnicodeObject);
1401 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001402 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 char_size = 1;
1404 is_ascii = 1;
1405 struct_size = sizeof(PyASCIIObject);
1406 }
1407 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001408 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 char_size = 1;
1410 }
1411 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001412 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 char_size = 2;
1414 if (sizeof(wchar_t) == 2)
1415 is_sharing = 1;
1416 }
1417 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001418 if (maxchar > MAX_UNICODE) {
1419 PyErr_SetString(PyExc_SystemError,
1420 "invalid maximum character passed to PyUnicode_New");
1421 return NULL;
1422 }
Victor Stinner8f825062012-04-27 13:55:39 +02001423 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 char_size = 4;
1425 if (sizeof(wchar_t) == 4)
1426 is_sharing = 1;
1427 }
1428
1429 /* Ensure we won't overflow the size. */
1430 if (size < 0) {
1431 PyErr_SetString(PyExc_SystemError,
1432 "Negative size passed to PyUnicode_New");
1433 return NULL;
1434 }
1435 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1436 return PyErr_NoMemory();
1437
1438 /* Duplicated allocation code from _PyObject_New() instead of a call to
1439 * PyObject_New() so we are able to allocate space for the object and
1440 * it's data buffer.
1441 */
1442 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1443 if (obj == NULL)
1444 return PyErr_NoMemory();
1445 obj = PyObject_INIT(obj, &PyUnicode_Type);
1446 if (obj == NULL)
1447 return NULL;
1448
1449 unicode = (PyCompactUnicodeObject *)obj;
1450 if (is_ascii)
1451 data = ((PyASCIIObject*)obj) + 1;
1452 else
1453 data = unicode + 1;
1454 _PyUnicode_LENGTH(unicode) = size;
1455 _PyUnicode_HASH(unicode) = -1;
1456 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001457 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 _PyUnicode_STATE(unicode).compact = 1;
1459 _PyUnicode_STATE(unicode).ready = 1;
1460 _PyUnicode_STATE(unicode).ascii = is_ascii;
1461 if (is_ascii) {
1462 ((char*)data)[size] = 0;
1463 _PyUnicode_WSTR(unicode) = NULL;
1464 }
Victor Stinner8f825062012-04-27 13:55:39 +02001465 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 ((char*)data)[size] = 0;
1467 _PyUnicode_WSTR(unicode) = NULL;
1468 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001470 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 else {
1473 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001474 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001475 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001477 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 ((Py_UCS4*)data)[size] = 0;
1479 if (is_sharing) {
1480 _PyUnicode_WSTR_LENGTH(unicode) = size;
1481 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1482 }
1483 else {
1484 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1485 _PyUnicode_WSTR(unicode) = NULL;
1486 }
1487 }
Victor Stinner8f825062012-04-27 13:55:39 +02001488#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001489 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001490#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001491 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 return obj;
1493}
1494
1495#if SIZEOF_WCHAR_T == 2
1496/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1497 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001498 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499
1500 This function assumes that unicode can hold one more code point than wstr
1501 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001502static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001503unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001504 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505{
1506 const wchar_t *iter;
1507 Py_UCS4 *ucs4_out;
1508
Victor Stinner910337b2011-10-03 03:20:16 +02001509 assert(unicode != NULL);
1510 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1512 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1513
1514 for (iter = begin; iter < end; ) {
1515 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1516 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001517 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1518 && (iter+1) < end
1519 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 {
Victor Stinner551ac952011-11-29 22:58:13 +01001521 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522 iter += 2;
1523 }
1524 else {
1525 *ucs4_out++ = *iter;
1526 iter++;
1527 }
1528 }
1529 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1530 _PyUnicode_GET_LENGTH(unicode)));
1531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532}
1533#endif
1534
Victor Stinnercd9950f2011-10-02 00:34:53 +02001535static int
Victor Stinner488fa492011-12-12 00:01:39 +01001536unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001537{
Victor Stinner488fa492011-12-12 00:01:39 +01001538 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001539 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001540 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001541 return -1;
1542 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001543 return 0;
1544}
1545
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546static int
1547_copy_characters(PyObject *to, Py_ssize_t to_start,
1548 PyObject *from, Py_ssize_t from_start,
1549 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001551 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001552 const void *from_data;
1553 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554
Victor Stinneree4544c2012-05-09 22:24:08 +02001555 assert(0 <= how_many);
1556 assert(0 <= from_start);
1557 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001559 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001560 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001561
Victor Stinnerd3f08822012-05-29 12:57:52 +02001562 assert(PyUnicode_Check(to));
1563 assert(PyUnicode_IS_READY(to));
1564 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1565
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001566 if (how_many == 0)
1567 return 0;
1568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001570 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001571 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001572 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
Victor Stinnerf1852262012-06-16 16:38:26 +02001574#ifdef Py_DEBUG
1575 if (!check_maxchar
1576 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1577 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001578 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001579 Py_UCS4 ch;
1580 Py_ssize_t i;
1581 for (i=0; i < how_many; i++) {
1582 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1583 assert(ch <= to_maxchar);
1584 }
1585 }
1586#endif
1587
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001588 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001589 if (check_maxchar
1590 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1591 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001592 /* Writing Latin-1 characters into an ASCII string requires to
1593 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001594 Py_UCS4 max_char;
1595 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001596 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001597 if (max_char >= 128)
1598 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001599 }
Christian Heimesf051e432016-09-13 20:22:02 +02001600 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001601 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001602 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001604 else if (from_kind == PyUnicode_1BYTE_KIND
1605 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001606 {
1607 _PyUnicode_CONVERT_BYTES(
1608 Py_UCS1, Py_UCS2,
1609 PyUnicode_1BYTE_DATA(from) + from_start,
1610 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1611 PyUnicode_2BYTE_DATA(to) + to_start
1612 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001613 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001614 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001615 && to_kind == PyUnicode_4BYTE_KIND)
1616 {
1617 _PyUnicode_CONVERT_BYTES(
1618 Py_UCS1, Py_UCS4,
1619 PyUnicode_1BYTE_DATA(from) + from_start,
1620 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1621 PyUnicode_4BYTE_DATA(to) + to_start
1622 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001623 }
1624 else if (from_kind == PyUnicode_2BYTE_KIND
1625 && to_kind == PyUnicode_4BYTE_KIND)
1626 {
1627 _PyUnicode_CONVERT_BYTES(
1628 Py_UCS2, Py_UCS4,
1629 PyUnicode_2BYTE_DATA(from) + from_start,
1630 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1631 PyUnicode_4BYTE_DATA(to) + to_start
1632 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001633 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001634 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001635 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1636
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001637 if (!check_maxchar) {
1638 if (from_kind == PyUnicode_2BYTE_KIND
1639 && to_kind == PyUnicode_1BYTE_KIND)
1640 {
1641 _PyUnicode_CONVERT_BYTES(
1642 Py_UCS2, Py_UCS1,
1643 PyUnicode_2BYTE_DATA(from) + from_start,
1644 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1645 PyUnicode_1BYTE_DATA(to) + to_start
1646 );
1647 }
1648 else if (from_kind == PyUnicode_4BYTE_KIND
1649 && to_kind == PyUnicode_1BYTE_KIND)
1650 {
1651 _PyUnicode_CONVERT_BYTES(
1652 Py_UCS4, Py_UCS1,
1653 PyUnicode_4BYTE_DATA(from) + from_start,
1654 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1655 PyUnicode_1BYTE_DATA(to) + to_start
1656 );
1657 }
1658 else if (from_kind == PyUnicode_4BYTE_KIND
1659 && to_kind == PyUnicode_2BYTE_KIND)
1660 {
1661 _PyUnicode_CONVERT_BYTES(
1662 Py_UCS4, Py_UCS2,
1663 PyUnicode_4BYTE_DATA(from) + from_start,
1664 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1665 PyUnicode_2BYTE_DATA(to) + to_start
1666 );
1667 }
1668 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001669 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001670 }
1671 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001672 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001673 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001674 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001675 Py_ssize_t i;
1676
Victor Stinnera0702ab2011-09-29 14:14:38 +02001677 for (i=0; i < how_many; i++) {
1678 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001679 if (ch > to_maxchar)
1680 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001681 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1682 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001683 }
1684 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001685 return 0;
1686}
1687
Victor Stinnerd3f08822012-05-29 12:57:52 +02001688void
1689_PyUnicode_FastCopyCharacters(
1690 PyObject *to, Py_ssize_t to_start,
1691 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001692{
1693 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1694}
1695
1696Py_ssize_t
1697PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1698 PyObject *from, Py_ssize_t from_start,
1699 Py_ssize_t how_many)
1700{
1701 int err;
1702
1703 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1704 PyErr_BadInternalCall();
1705 return -1;
1706 }
1707
Benjamin Petersonbac79492012-01-14 13:34:47 -05001708 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001709 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001710 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001711 return -1;
1712
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001713 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001714 PyErr_SetString(PyExc_IndexError, "string index out of range");
1715 return -1;
1716 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001717 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001718 PyErr_SetString(PyExc_IndexError, "string index out of range");
1719 return -1;
1720 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001721 if (how_many < 0) {
1722 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1723 return -1;
1724 }
1725 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001726 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1727 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001728 "Cannot write %zi characters at %zi "
1729 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001730 how_many, to_start, PyUnicode_GET_LENGTH(to));
1731 return -1;
1732 }
1733
1734 if (how_many == 0)
1735 return 0;
1736
Victor Stinner488fa492011-12-12 00:01:39 +01001737 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001738 return -1;
1739
1740 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1741 if (err) {
1742 PyErr_Format(PyExc_SystemError,
1743 "Cannot copy %s characters "
1744 "into a string of %s characters",
1745 unicode_kind_name(from),
1746 unicode_kind_name(to));
1747 return -1;
1748 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001749 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750}
1751
Victor Stinner17222162011-09-28 22:15:37 +02001752/* Find the maximum code point and count the number of surrogate pairs so a
1753 correct string length can be computed before converting a string to UCS4.
1754 This function counts single surrogates as a character and not as a pair.
1755
1756 Return 0 on success, or -1 on error. */
1757static int
1758find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1759 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760{
1761 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001762 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763
Victor Stinnerc53be962011-10-02 21:33:54 +02001764 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 *num_surrogates = 0;
1766 *maxchar = 0;
1767
1768 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001770 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1771 && (iter+1) < end
1772 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1773 {
1774 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1775 ++(*num_surrogates);
1776 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 }
1778 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001780 {
1781 ch = *iter;
1782 iter++;
1783 }
1784 if (ch > *maxchar) {
1785 *maxchar = ch;
1786 if (*maxchar > MAX_UNICODE) {
1787 PyErr_Format(PyExc_ValueError,
1788 "character U+%x is not in range [U+0000; U+10ffff]",
1789 ch);
1790 return -1;
1791 }
1792 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 }
1794 return 0;
1795}
1796
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001797int
1798_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799{
1800 wchar_t *end;
1801 Py_UCS4 maxchar = 0;
1802 Py_ssize_t num_surrogates;
1803#if SIZEOF_WCHAR_T == 2
1804 Py_ssize_t length_wo_surrogates;
1805#endif
1806
Georg Brandl7597add2011-10-05 16:36:47 +02001807 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001808 strings were created using _PyObject_New() and where no canonical
1809 representation (the str field) has been set yet aka strings
1810 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001811 assert(_PyUnicode_CHECK(unicode));
1812 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001814 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001815 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001816 /* Actually, it should neither be interned nor be anything else: */
1817 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001820 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001821 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823
1824 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001825 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1826 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 PyErr_NoMemory();
1828 return -1;
1829 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001830 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 _PyUnicode_WSTR(unicode), end,
1832 PyUnicode_1BYTE_DATA(unicode));
1833 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1834 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1835 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1836 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001837 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001838 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001839 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 }
1841 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001842 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001843 _PyUnicode_UTF8(unicode) = NULL;
1844 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 }
1846 PyObject_FREE(_PyUnicode_WSTR(unicode));
1847 _PyUnicode_WSTR(unicode) = NULL;
1848 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1849 }
1850 /* In this case we might have to convert down from 4-byte native
1851 wchar_t to 2-byte unicode. */
1852 else if (maxchar < 65536) {
1853 assert(num_surrogates == 0 &&
1854 "FindMaxCharAndNumSurrogatePairs() messed up");
1855
Victor Stinner506f5922011-09-28 22:34:18 +02001856#if SIZEOF_WCHAR_T == 2
1857 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001858 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001859 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1860 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1861 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001862 _PyUnicode_UTF8(unicode) = NULL;
1863 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001864#else
1865 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001866 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001867 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001868 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001869 PyErr_NoMemory();
1870 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 }
Victor Stinner506f5922011-09-28 22:34:18 +02001872 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1873 _PyUnicode_WSTR(unicode), end,
1874 PyUnicode_2BYTE_DATA(unicode));
1875 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1876 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1877 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001878 _PyUnicode_UTF8(unicode) = NULL;
1879 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001880 PyObject_FREE(_PyUnicode_WSTR(unicode));
1881 _PyUnicode_WSTR(unicode) = NULL;
1882 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1883#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884 }
1885 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1886 else {
1887#if SIZEOF_WCHAR_T == 2
1888 /* in case the native representation is 2-bytes, we need to allocate a
1889 new normalized 4-byte version. */
1890 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001891 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1892 PyErr_NoMemory();
1893 return -1;
1894 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001895 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1896 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 PyErr_NoMemory();
1898 return -1;
1899 }
1900 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1901 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001902 _PyUnicode_UTF8(unicode) = NULL;
1903 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001904 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1905 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001906 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 PyObject_FREE(_PyUnicode_WSTR(unicode));
1908 _PyUnicode_WSTR(unicode) = NULL;
1909 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1910#else
1911 assert(num_surrogates == 0);
1912
Victor Stinnerc3c74152011-10-02 20:39:55 +02001913 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001915 _PyUnicode_UTF8(unicode) = NULL;
1916 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001917 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1918#endif
1919 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1920 }
1921 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001922 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 return 0;
1924}
1925
Alexander Belopolsky40018472011-02-26 01:02:56 +00001926static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001927unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928{
Walter Dörwald16807132007-05-25 13:52:07 +00001929 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001930 case SSTATE_NOT_INTERNED:
1931 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001932
Benjamin Peterson29060642009-01-31 22:14:21 +00001933 case SSTATE_INTERNED_MORTAL:
1934 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001935 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001936#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001937 if (PyDict_DelItem(interned, unicode) != 0) {
1938 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1939 NULL);
1940 }
Victor Stinner607b1022020-05-05 18:50:30 +02001941#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001943
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001945 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1946 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001947
Benjamin Peterson29060642009-01-31 22:14:21 +00001948 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001949 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001950 }
1951
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001952 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001954 }
1955 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001956 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001957 }
1958 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001959 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001962 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963}
1964
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001965#ifdef Py_DEBUG
1966static int
1967unicode_is_singleton(PyObject *unicode)
1968{
Victor Stinner607b1022020-05-05 18:50:30 +02001969 if (unicode == unicode_empty) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001970 return 1;
Victor Stinner607b1022020-05-05 18:50:30 +02001971 }
1972#ifdef LATIN1_SINGLETONS
1973 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001974 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1975 {
1976 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1977 if (ch < 256 && unicode_latin1[ch] == unicode)
1978 return 1;
1979 }
Victor Stinner607b1022020-05-05 18:50:30 +02001980#endif
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001981 return 0;
1982}
1983#endif
1984
Alexander Belopolsky40018472011-02-26 01:02:56 +00001985static int
Victor Stinner488fa492011-12-12 00:01:39 +01001986unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001987{
Victor Stinner488fa492011-12-12 00:01:39 +01001988 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001989 if (Py_REFCNT(unicode) != 1)
1990 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001991 if (_PyUnicode_HASH(unicode) != -1)
1992 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001993 if (PyUnicode_CHECK_INTERNED(unicode))
1994 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001995 if (!PyUnicode_CheckExact(unicode))
1996 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001997#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001998 /* singleton refcount is greater than 1 */
1999 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002000#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002001 return 1;
2002}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002003
Victor Stinnerfe226c02011-10-03 03:52:20 +02002004static int
2005unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2006{
2007 PyObject *unicode;
2008 Py_ssize_t old_length;
2009
2010 assert(p_unicode != NULL);
2011 unicode = *p_unicode;
2012
2013 assert(unicode != NULL);
2014 assert(PyUnicode_Check(unicode));
2015 assert(0 <= length);
2016
Victor Stinner910337b2011-10-03 03:20:16 +02002017 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002018 old_length = PyUnicode_WSTR_LENGTH(unicode);
2019 else
2020 old_length = PyUnicode_GET_LENGTH(unicode);
2021 if (old_length == length)
2022 return 0;
2023
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002024 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002025 _Py_INCREF_UNICODE_EMPTY();
2026 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002027 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002028 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002029 return 0;
2030 }
2031
Victor Stinner488fa492011-12-12 00:01:39 +01002032 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002033 PyObject *copy = resize_copy(unicode, length);
2034 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002035 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002036 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002037 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002038 }
2039
Victor Stinnerfe226c02011-10-03 03:52:20 +02002040 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002041 PyObject *new_unicode = resize_compact(unicode, length);
2042 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002043 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002044 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002045 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002046 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002047 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002048}
2049
Alexander Belopolsky40018472011-02-26 01:02:56 +00002050int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002051PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002052{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002053 PyObject *unicode;
2054 if (p_unicode == NULL) {
2055 PyErr_BadInternalCall();
2056 return -1;
2057 }
2058 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002059 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002060 {
2061 PyErr_BadInternalCall();
2062 return -1;
2063 }
2064 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002065}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002066
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002067/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002068
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002069 WARNING: The function doesn't copy the terminating null character and
2070 doesn't check the maximum character (may write a latin1 character in an
2071 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002072static void
2073unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2074 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002075{
2076 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002077 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002078 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002079
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002080 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002081 switch (kind) {
2082 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002083#ifdef Py_DEBUG
2084 if (PyUnicode_IS_ASCII(unicode)) {
2085 Py_UCS4 maxchar = ucs1lib_find_max_char(
2086 (const Py_UCS1*)str,
2087 (const Py_UCS1*)str + len);
2088 assert(maxchar < 128);
2089 }
2090#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002091 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002092 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002093 }
2094 case PyUnicode_2BYTE_KIND: {
2095 Py_UCS2 *start = (Py_UCS2 *)data + index;
2096 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002097
Victor Stinner184252a2012-06-16 02:57:41 +02002098 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002099 *ucs2 = (Py_UCS2)*str;
2100
2101 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002102 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002104 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002105 Py_UCS4 *start = (Py_UCS4 *)data + index;
2106 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002107
Victor Stinner184252a2012-06-16 02:57:41 +02002108 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 *ucs4 = (Py_UCS4)*str;
2110
2111 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002112 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002113 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002114 default:
2115 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002116 }
2117}
2118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119static PyObject*
2120get_latin1_char(unsigned char ch)
2121{
Victor Stinner607b1022020-05-05 18:50:30 +02002122 PyObject *unicode;
2123
2124#ifdef LATIN1_SINGLETONS
2125 unicode = unicode_latin1[ch];
2126 if (unicode) {
2127 Py_INCREF(unicode);
2128 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 }
Victor Stinner607b1022020-05-05 18:50:30 +02002130#endif
2131
2132 unicode = PyUnicode_New(1, ch);
2133 if (!unicode) {
2134 return NULL;
2135 }
2136
2137 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2138 assert(_PyUnicode_CheckConsistency(unicode, 1));
2139
2140#ifdef LATIN1_SINGLETONS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002141 Py_INCREF(unicode);
Victor Stinner607b1022020-05-05 18:50:30 +02002142 unicode_latin1[ch] = unicode;
2143#endif
Victor Stinnera464fc12011-10-02 20:39:30 +02002144 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145}
2146
Victor Stinner985a82a2014-01-03 12:53:47 +01002147static PyObject*
2148unicode_char(Py_UCS4 ch)
2149{
2150 PyObject *unicode;
2151
2152 assert(ch <= MAX_UNICODE);
2153
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002154 if (ch < 256)
2155 return get_latin1_char(ch);
2156
Victor Stinner985a82a2014-01-03 12:53:47 +01002157 unicode = PyUnicode_New(1, ch);
2158 if (unicode == NULL)
2159 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002160
2161 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2162 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002163 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002164 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002165 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2166 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2167 }
2168 assert(_PyUnicode_CheckConsistency(unicode, 1));
2169 return unicode;
2170}
2171
Alexander Belopolsky40018472011-02-26 01:02:56 +00002172PyObject *
2173PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002175 if (u == NULL)
2176 return (PyObject*)_PyUnicode_New(size);
2177
2178 if (size < 0) {
2179 PyErr_BadInternalCall();
2180 return NULL;
2181 }
2182
2183 return PyUnicode_FromWideChar(u, size);
2184}
2185
2186PyObject *
2187PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2188{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002189 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 Py_UCS4 maxchar = 0;
2191 Py_ssize_t num_surrogates;
2192
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002193 if (u == NULL && size != 0) {
2194 PyErr_BadInternalCall();
2195 return NULL;
2196 }
2197
2198 if (size == -1) {
2199 size = wcslen(u);
2200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002202 /* If the Unicode data is known at construction time, we can apply
2203 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002206 if (size == 0)
2207 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 /* Single character Unicode objects in the Latin-1 range are
2210 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002211 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 return get_latin1_char((unsigned char)*u);
2213
2214 /* If not empty and not single character, copy the Unicode data
2215 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002216 if (find_maxchar_surrogates(u, u + size,
2217 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return NULL;
2219
Victor Stinner8faf8212011-12-08 22:14:11 +01002220 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 if (!unicode)
2222 return NULL;
2223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 switch (PyUnicode_KIND(unicode)) {
2225 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002226 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2228 break;
2229 case PyUnicode_2BYTE_KIND:
2230#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002231 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002233 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2235#endif
2236 break;
2237 case PyUnicode_4BYTE_KIND:
2238#if SIZEOF_WCHAR_T == 2
2239 /* This is the only case which has to process surrogates, thus
2240 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002241 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242#else
2243 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002244 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245#endif
2246 break;
2247 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002248 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002251 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252}
2253
Alexander Belopolsky40018472011-02-26 01:02:56 +00002254PyObject *
2255PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002256{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002257 if (size < 0) {
2258 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002259 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002260 return NULL;
2261 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002262 if (u != NULL)
2263 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2264 else
2265 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002266}
2267
Alexander Belopolsky40018472011-02-26 01:02:56 +00002268PyObject *
2269PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002270{
2271 size_t size = strlen(u);
2272 if (size > PY_SSIZE_T_MAX) {
2273 PyErr_SetString(PyExc_OverflowError, "input too long");
2274 return NULL;
2275 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002276 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002277}
2278
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002279PyObject *
2280_PyUnicode_FromId(_Py_Identifier *id)
2281{
2282 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002283 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2284 strlen(id->string),
2285 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002286 if (!id->object)
2287 return NULL;
2288 PyUnicode_InternInPlace(&id->object);
2289 assert(!id->next);
2290 id->next = static_strings;
2291 static_strings = id;
2292 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002293 return id->object;
2294}
2295
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002296static void
2297unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002298{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002299 _Py_Identifier *tmp, *s = static_strings;
2300 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002301 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002302 tmp = s->next;
2303 s->next = NULL;
2304 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002305 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002306 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002307}
2308
Benjamin Peterson0df54292012-03-26 14:50:32 -04002309/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002310
Victor Stinnerd3f08822012-05-29 12:57:52 +02002311PyObject*
2312_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002313{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002314 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002315 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002316 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002317#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002318 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002319#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002320 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002321 }
Victor Stinner785938e2011-12-11 20:09:03 +01002322 unicode = PyUnicode_New(size, 127);
2323 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002324 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002325 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2326 assert(_PyUnicode_CheckConsistency(unicode, 1));
2327 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002328}
2329
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002330static Py_UCS4
2331kind_maxchar_limit(unsigned int kind)
2332{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002333 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002334 case PyUnicode_1BYTE_KIND:
2335 return 0x80;
2336 case PyUnicode_2BYTE_KIND:
2337 return 0x100;
2338 case PyUnicode_4BYTE_KIND:
2339 return 0x10000;
2340 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002341 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002342 }
2343}
2344
Victor Stinner702c7342011-10-05 13:50:52 +02002345static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002346_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002349 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002350
Serhiy Storchaka678db842013-01-26 12:16:36 +02002351 if (size == 0)
2352 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002353 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002354 if (size == 1)
2355 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002356
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002357 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002358 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 if (!res)
2360 return NULL;
2361 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002362 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002363 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002364}
2365
Victor Stinnere57b1c02011-09-28 22:20:48 +02002366static PyObject*
2367_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368{
2369 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002370 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002371
Serhiy Storchaka678db842013-01-26 12:16:36 +02002372 if (size == 0)
2373 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002374 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002375 if (size == 1)
2376 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002377
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002378 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002379 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 if (!res)
2381 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002382 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002384 else {
2385 _PyUnicode_CONVERT_BYTES(
2386 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2387 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002388 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 return res;
2390}
2391
Victor Stinnere57b1c02011-09-28 22:20:48 +02002392static PyObject*
2393_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394{
2395 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002396 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397
Serhiy Storchaka678db842013-01-26 12:16:36 +02002398 if (size == 0)
2399 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002400 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002401 if (size == 1)
2402 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002403
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002404 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002405 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 if (!res)
2407 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002408 if (max_char < 256)
2409 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2410 PyUnicode_1BYTE_DATA(res));
2411 else if (max_char < 0x10000)
2412 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2413 PyUnicode_2BYTE_DATA(res));
2414 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002416 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 return res;
2418}
2419
2420PyObject*
2421PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2422{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002423 if (size < 0) {
2424 PyErr_SetString(PyExc_ValueError, "size must be positive");
2425 return NULL;
2426 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002427 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002429 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002431 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002433 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002434 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002435 PyErr_SetString(PyExc_SystemError, "invalid kind");
2436 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438}
2439
Victor Stinnerece58de2012-04-23 23:36:38 +02002440Py_UCS4
2441_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2442{
2443 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002444 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002445
2446 assert(PyUnicode_IS_READY(unicode));
2447 assert(0 <= start);
2448 assert(end <= PyUnicode_GET_LENGTH(unicode));
2449 assert(start <= end);
2450
2451 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2452 return PyUnicode_MAX_CHAR_VALUE(unicode);
2453
2454 if (start == end)
2455 return 127;
2456
Victor Stinner94d558b2012-04-27 22:26:58 +02002457 if (PyUnicode_IS_ASCII(unicode))
2458 return 127;
2459
Victor Stinnerece58de2012-04-23 23:36:38 +02002460 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002461 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002462 endptr = (char *)startptr + end * kind;
2463 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002464 switch(kind) {
2465 case PyUnicode_1BYTE_KIND:
2466 return ucs1lib_find_max_char(startptr, endptr);
2467 case PyUnicode_2BYTE_KIND:
2468 return ucs2lib_find_max_char(startptr, endptr);
2469 case PyUnicode_4BYTE_KIND:
2470 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002471 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002472 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002473 }
2474}
2475
Victor Stinner25a4b292011-10-06 12:31:55 +02002476/* Ensure that a string uses the most efficient storage, if it is not the
2477 case: create a new string with of the right kind. Write NULL into *p_unicode
2478 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002479static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002480unicode_adjust_maxchar(PyObject **p_unicode)
2481{
2482 PyObject *unicode, *copy;
2483 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002484 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002485 unsigned int kind;
2486
2487 assert(p_unicode != NULL);
2488 unicode = *p_unicode;
2489 assert(PyUnicode_IS_READY(unicode));
2490 if (PyUnicode_IS_ASCII(unicode))
2491 return;
2492
2493 len = PyUnicode_GET_LENGTH(unicode);
2494 kind = PyUnicode_KIND(unicode);
2495 if (kind == PyUnicode_1BYTE_KIND) {
2496 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002497 max_char = ucs1lib_find_max_char(u, u + len);
2498 if (max_char >= 128)
2499 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002500 }
2501 else if (kind == PyUnicode_2BYTE_KIND) {
2502 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002503 max_char = ucs2lib_find_max_char(u, u + len);
2504 if (max_char >= 256)
2505 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002506 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002507 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002508 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002509 max_char = ucs4lib_find_max_char(u, u + len);
2510 if (max_char >= 0x10000)
2511 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002512 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002513 else
2514 Py_UNREACHABLE();
2515
Victor Stinner25a4b292011-10-06 12:31:55 +02002516 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002517 if (copy != NULL)
2518 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002519 Py_DECREF(unicode);
2520 *p_unicode = copy;
2521}
2522
Victor Stinner034f6cf2011-09-30 02:26:44 +02002523PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002524_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002525{
Victor Stinner87af4f22011-11-21 23:03:47 +01002526 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002527 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002528
Victor Stinner034f6cf2011-09-30 02:26:44 +02002529 if (!PyUnicode_Check(unicode)) {
2530 PyErr_BadInternalCall();
2531 return NULL;
2532 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002533 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002534 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002535
Victor Stinner87af4f22011-11-21 23:03:47 +01002536 length = PyUnicode_GET_LENGTH(unicode);
2537 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002538 if (!copy)
2539 return NULL;
2540 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2541
Christian Heimesf051e432016-09-13 20:22:02 +02002542 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002543 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002544 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002545 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002546}
2547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548
Victor Stinnerbc603d12011-10-02 01:00:40 +02002549/* Widen Unicode objects to larger buffers. Don't write terminating null
2550 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002552static void*
2553unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002555 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002556
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002557 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002558 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002559 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002560 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002561 if (!result)
2562 return PyErr_NoMemory();
2563 assert(skind == PyUnicode_1BYTE_KIND);
2564 _PyUnicode_CONVERT_BYTES(
2565 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002566 (const Py_UCS1 *)data,
2567 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002568 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002570 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002571 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002572 if (!result)
2573 return PyErr_NoMemory();
2574 if (skind == PyUnicode_2BYTE_KIND) {
2575 _PyUnicode_CONVERT_BYTES(
2576 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002577 (const Py_UCS2 *)data,
2578 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002579 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002581 else {
2582 assert(skind == PyUnicode_1BYTE_KIND);
2583 _PyUnicode_CONVERT_BYTES(
2584 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002585 (const Py_UCS1 *)data,
2586 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002587 result);
2588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002590 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002591 Py_UNREACHABLE();
2592 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594}
2595
2596static Py_UCS4*
2597as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2598 int copy_null)
2599{
2600 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002601 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 Py_ssize_t len, targetlen;
2603 if (PyUnicode_READY(string) == -1)
2604 return NULL;
2605 kind = PyUnicode_KIND(string);
2606 data = PyUnicode_DATA(string);
2607 len = PyUnicode_GET_LENGTH(string);
2608 targetlen = len;
2609 if (copy_null)
2610 targetlen++;
2611 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002612 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 if (!target) {
2614 PyErr_NoMemory();
2615 return NULL;
2616 }
2617 }
2618 else {
2619 if (targetsize < targetlen) {
2620 PyErr_Format(PyExc_SystemError,
2621 "string is longer than the buffer");
2622 if (copy_null && 0 < targetsize)
2623 target[0] = 0;
2624 return NULL;
2625 }
2626 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002627 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002628 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002629 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002631 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002632 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002633 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2634 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002635 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002636 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002637 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002638 else {
2639 Py_UNREACHABLE();
2640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 if (copy_null)
2642 target[len] = 0;
2643 return target;
2644}
2645
2646Py_UCS4*
2647PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2648 int copy_null)
2649{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002650 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 PyErr_BadInternalCall();
2652 return NULL;
2653 }
2654 return as_ucs4(string, target, targetsize, copy_null);
2655}
2656
2657Py_UCS4*
2658PyUnicode_AsUCS4Copy(PyObject *string)
2659{
2660 return as_ucs4(string, NULL, 0, 1);
2661}
2662
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2666#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002667
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002668static int
2669unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2670 Py_ssize_t width, Py_ssize_t precision)
2671{
2672 Py_ssize_t length, fill, arglen;
2673 Py_UCS4 maxchar;
2674
2675 if (PyUnicode_READY(str) == -1)
2676 return -1;
2677
2678 length = PyUnicode_GET_LENGTH(str);
2679 if ((precision == -1 || precision >= length)
2680 && width <= length)
2681 return _PyUnicodeWriter_WriteStr(writer, str);
2682
2683 if (precision != -1)
2684 length = Py_MIN(precision, length);
2685
2686 arglen = Py_MAX(length, width);
2687 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2688 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2689 else
2690 maxchar = writer->maxchar;
2691
2692 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2693 return -1;
2694
2695 if (width > length) {
2696 fill = width - length;
2697 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2698 return -1;
2699 writer->pos += fill;
2700 }
2701
2702 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2703 str, 0, length);
2704 writer->pos += length;
2705 return 0;
2706}
2707
2708static int
Victor Stinner998b8062018-09-12 00:23:25 +02002709unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002710 Py_ssize_t width, Py_ssize_t precision)
2711{
2712 /* UTF-8 */
2713 Py_ssize_t length;
2714 PyObject *unicode;
2715 int res;
2716
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002717 if (precision == -1) {
2718 length = strlen(str);
2719 }
2720 else {
2721 length = 0;
2722 while (length < precision && str[length]) {
2723 length++;
2724 }
2725 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2727 if (unicode == NULL)
2728 return -1;
2729
2730 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2731 Py_DECREF(unicode);
2732 return res;
2733}
2734
Victor Stinner96865452011-03-01 23:44:09 +00002735static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002736unicode_fromformat_arg(_PyUnicodeWriter *writer,
2737 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002738{
Victor Stinnere215d962012-10-06 23:03:36 +02002739 const char *p;
2740 Py_ssize_t len;
2741 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002742 Py_ssize_t width;
2743 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002744 int longflag;
2745 int longlongflag;
2746 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002748
2749 p = f;
2750 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002751 zeropad = 0;
2752 if (*f == '0') {
2753 zeropad = 1;
2754 f++;
2755 }
Victor Stinner96865452011-03-01 23:44:09 +00002756
2757 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002758 width = -1;
2759 if (Py_ISDIGIT((unsigned)*f)) {
2760 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002761 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002762 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002763 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002764 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002766 return NULL;
2767 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002768 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002769 f++;
2770 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002771 }
2772 precision = -1;
2773 if (*f == '.') {
2774 f++;
2775 if (Py_ISDIGIT((unsigned)*f)) {
2776 precision = (*f - '0');
2777 f++;
2778 while (Py_ISDIGIT((unsigned)*f)) {
2779 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2780 PyErr_SetString(PyExc_ValueError,
2781 "precision too big");
2782 return NULL;
2783 }
2784 precision = (precision * 10) + (*f - '0');
2785 f++;
2786 }
2787 }
Victor Stinner96865452011-03-01 23:44:09 +00002788 if (*f == '%') {
2789 /* "%.3%s" => f points to "3" */
2790 f--;
2791 }
2792 }
2793 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002794 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002795 f--;
2796 }
Victor Stinner96865452011-03-01 23:44:09 +00002797
2798 /* Handle %ld, %lu, %lld and %llu. */
2799 longflag = 0;
2800 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002801 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002802 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002803 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002804 longflag = 1;
2805 ++f;
2806 }
Victor Stinner96865452011-03-01 23:44:09 +00002807 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002808 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002809 longlongflag = 1;
2810 f += 2;
2811 }
Victor Stinner96865452011-03-01 23:44:09 +00002812 }
2813 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002814 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002815 size_tflag = 1;
2816 ++f;
2817 }
Victor Stinnere215d962012-10-06 23:03:36 +02002818
2819 if (f[1] == '\0')
2820 writer->overallocate = 0;
2821
2822 switch (*f) {
2823 case 'c':
2824 {
2825 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002826 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002827 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002828 "character argument not in range(0x110000)");
2829 return NULL;
2830 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002831 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002832 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002833 break;
2834 }
2835
2836 case 'i':
2837 case 'd':
2838 case 'u':
2839 case 'x':
2840 {
2841 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002842 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002843 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002844
2845 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002846 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002847 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002848 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002849 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002850 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002851 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002852 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002853 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002854 va_arg(*vargs, size_t));
2855 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002856 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002857 va_arg(*vargs, unsigned int));
2858 }
2859 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002860 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002861 }
2862 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002863 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002864 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002865 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002866 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002867 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002868 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002869 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002870 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002871 va_arg(*vargs, Py_ssize_t));
2872 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002873 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002874 va_arg(*vargs, int));
2875 }
2876 assert(len >= 0);
2877
Victor Stinnere215d962012-10-06 23:03:36 +02002878 if (precision < len)
2879 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002880
2881 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002882 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2883 return NULL;
2884
Victor Stinnere215d962012-10-06 23:03:36 +02002885 if (width > precision) {
2886 Py_UCS4 fillchar;
2887 fill = width - precision;
2888 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002889 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2890 return NULL;
2891 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002892 }
Victor Stinner15a11362012-10-06 23:48:20 +02002893 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002894 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002895 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2896 return NULL;
2897 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002898 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002899
Victor Stinner4a587072013-11-19 12:54:53 +01002900 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2901 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002902 break;
2903 }
2904
2905 case 'p':
2906 {
2907 char number[MAX_LONG_LONG_CHARS];
2908
2909 len = sprintf(number, "%p", va_arg(*vargs, void*));
2910 assert(len >= 0);
2911
2912 /* %p is ill-defined: ensure leading 0x. */
2913 if (number[1] == 'X')
2914 number[1] = 'x';
2915 else if (number[1] != 'x') {
2916 memmove(number + 2, number,
2917 strlen(number) + 1);
2918 number[0] = '0';
2919 number[1] = 'x';
2920 len += 2;
2921 }
2922
Victor Stinner4a587072013-11-19 12:54:53 +01002923 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002924 return NULL;
2925 break;
2926 }
2927
2928 case 's':
2929 {
2930 /* UTF-8 */
2931 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002932 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002933 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002934 break;
2935 }
2936
2937 case 'U':
2938 {
2939 PyObject *obj = va_arg(*vargs, PyObject *);
2940 assert(obj && _PyUnicode_CHECK(obj));
2941
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002942 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002943 return NULL;
2944 break;
2945 }
2946
2947 case 'V':
2948 {
2949 PyObject *obj = va_arg(*vargs, PyObject *);
2950 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002951 if (obj) {
2952 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002953 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002954 return NULL;
2955 }
2956 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002957 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002958 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002959 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002960 }
2961 break;
2962 }
2963
2964 case 'S':
2965 {
2966 PyObject *obj = va_arg(*vargs, PyObject *);
2967 PyObject *str;
2968 assert(obj);
2969 str = PyObject_Str(obj);
2970 if (!str)
2971 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002972 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002973 Py_DECREF(str);
2974 return NULL;
2975 }
2976 Py_DECREF(str);
2977 break;
2978 }
2979
2980 case 'R':
2981 {
2982 PyObject *obj = va_arg(*vargs, PyObject *);
2983 PyObject *repr;
2984 assert(obj);
2985 repr = PyObject_Repr(obj);
2986 if (!repr)
2987 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002988 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002989 Py_DECREF(repr);
2990 return NULL;
2991 }
2992 Py_DECREF(repr);
2993 break;
2994 }
2995
2996 case 'A':
2997 {
2998 PyObject *obj = va_arg(*vargs, PyObject *);
2999 PyObject *ascii;
3000 assert(obj);
3001 ascii = PyObject_ASCII(obj);
3002 if (!ascii)
3003 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003004 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003005 Py_DECREF(ascii);
3006 return NULL;
3007 }
3008 Py_DECREF(ascii);
3009 break;
3010 }
3011
3012 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003013 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003014 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003015 break;
3016
3017 default:
3018 /* if we stumble upon an unknown formatting code, copy the rest
3019 of the format string to the output string. (we cannot just
3020 skip the code, since there's no way to know what's in the
3021 argument list) */
3022 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003023 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003024 return NULL;
3025 f = p+len;
3026 return f;
3027 }
3028
3029 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003030 return f;
3031}
3032
Walter Dörwaldd2034312007-05-18 16:29:38 +00003033PyObject *
3034PyUnicode_FromFormatV(const char *format, va_list vargs)
3035{
Victor Stinnere215d962012-10-06 23:03:36 +02003036 va_list vargs2;
3037 const char *f;
3038 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003039
Victor Stinner8f674cc2013-04-17 23:02:17 +02003040 _PyUnicodeWriter_Init(&writer);
3041 writer.min_length = strlen(format) + 100;
3042 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003043
Benjamin Peterson0c212142016-09-20 20:39:33 -07003044 // Copy varags to be able to pass a reference to a subfunction.
3045 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003046
3047 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003048 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003049 f = unicode_fromformat_arg(&writer, f, &vargs2);
3050 if (f == NULL)
3051 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003053 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003054 const char *p;
3055 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003056
Victor Stinnere215d962012-10-06 23:03:36 +02003057 p = f;
3058 do
3059 {
3060 if ((unsigned char)*p > 127) {
3061 PyErr_Format(PyExc_ValueError,
3062 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3063 "string, got a non-ASCII byte: 0x%02x",
3064 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003065 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003066 }
3067 p++;
3068 }
3069 while (*p != '\0' && *p != '%');
3070 len = p - f;
3071
3072 if (*p == '\0')
3073 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003074
3075 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003076 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003077
3078 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003079 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003080 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003081 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003082 return _PyUnicodeWriter_Finish(&writer);
3083
3084 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003085 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003086 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003087 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003088}
3089
Walter Dörwaldd2034312007-05-18 16:29:38 +00003090PyObject *
3091PyUnicode_FromFormat(const char *format, ...)
3092{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003093 PyObject* ret;
3094 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003095
3096#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003097 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003098#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003099 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003100#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003101 ret = PyUnicode_FromFormatV(format, vargs);
3102 va_end(vargs);
3103 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003104}
3105
Serhiy Storchakac46db922018-10-23 22:58:24 +03003106static Py_ssize_t
3107unicode_get_widechar_size(PyObject *unicode)
3108{
3109 Py_ssize_t res;
3110
3111 assert(unicode != NULL);
3112 assert(_PyUnicode_CHECK(unicode));
3113
3114 if (_PyUnicode_WSTR(unicode) != NULL) {
3115 return PyUnicode_WSTR_LENGTH(unicode);
3116 }
3117 assert(PyUnicode_IS_READY(unicode));
3118
3119 res = _PyUnicode_LENGTH(unicode);
3120#if SIZEOF_WCHAR_T == 2
3121 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3122 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3123 const Py_UCS4 *end = s + res;
3124 for (; s < end; ++s) {
3125 if (*s > 0xFFFF) {
3126 ++res;
3127 }
3128 }
3129 }
3130#endif
3131 return res;
3132}
3133
3134static void
3135unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3136{
3137 const wchar_t *wstr;
3138
3139 assert(unicode != NULL);
3140 assert(_PyUnicode_CHECK(unicode));
3141
3142 wstr = _PyUnicode_WSTR(unicode);
3143 if (wstr != NULL) {
3144 memcpy(w, wstr, size * sizeof(wchar_t));
3145 return;
3146 }
3147 assert(PyUnicode_IS_READY(unicode));
3148
3149 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3150 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3151 for (; size--; ++s, ++w) {
3152 *w = *s;
3153 }
3154 }
3155 else {
3156#if SIZEOF_WCHAR_T == 4
3157 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3158 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3159 for (; size--; ++s, ++w) {
3160 *w = *s;
3161 }
3162#else
3163 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3164 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3165 for (; size--; ++s, ++w) {
3166 Py_UCS4 ch = *s;
3167 if (ch > 0xFFFF) {
3168 assert(ch <= MAX_UNICODE);
3169 /* encode surrogate pair in this case */
3170 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3171 if (!size--)
3172 break;
3173 *w = Py_UNICODE_LOW_SURROGATE(ch);
3174 }
3175 else {
3176 *w = ch;
3177 }
3178 }
3179#endif
3180 }
3181}
3182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003183#ifdef HAVE_WCHAR_H
3184
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003185/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003186
Victor Stinnerd88d9832011-09-06 02:00:05 +02003187 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003188 character) required to convert the unicode object. Ignore size argument.
3189
Victor Stinnerd88d9832011-09-06 02:00:05 +02003190 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003191 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003192 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003193Py_ssize_t
3194PyUnicode_AsWideChar(PyObject *unicode,
3195 wchar_t *w,
3196 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003197{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003198 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003199
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003200 if (unicode == NULL) {
3201 PyErr_BadInternalCall();
3202 return -1;
3203 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003204 if (!PyUnicode_Check(unicode)) {
3205 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003206 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003207 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003208
3209 res = unicode_get_widechar_size(unicode);
3210 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003211 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003212 }
3213
3214 if (size > res) {
3215 size = res + 1;
3216 }
3217 else {
3218 res = size;
3219 }
3220 unicode_copy_as_widechar(unicode, w, size);
3221 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003222}
3223
Victor Stinner137c34c2010-09-29 10:25:54 +00003224wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003225PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003226 Py_ssize_t *size)
3227{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003228 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003229 Py_ssize_t buflen;
3230
3231 if (unicode == NULL) {
3232 PyErr_BadInternalCall();
3233 return NULL;
3234 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003235 if (!PyUnicode_Check(unicode)) {
3236 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003237 return NULL;
3238 }
3239
Serhiy Storchakac46db922018-10-23 22:58:24 +03003240 buflen = unicode_get_widechar_size(unicode);
3241 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003242 if (buffer == NULL) {
3243 PyErr_NoMemory();
3244 return NULL;
3245 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003246 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3247 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003248 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003249 }
3250 else if (wcslen(buffer) != (size_t)buflen) {
3251 PyMem_FREE(buffer);
3252 PyErr_SetString(PyExc_ValueError,
3253 "embedded null character");
3254 return NULL;
3255 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003256 return buffer;
3257}
3258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003259#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260
Alexander Belopolsky40018472011-02-26 01:02:56 +00003261PyObject *
3262PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003263{
Victor Stinner8faf8212011-12-08 22:14:11 +01003264 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 PyErr_SetString(PyExc_ValueError,
3266 "chr() arg not in range(0x110000)");
3267 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003268 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003269
Victor Stinner985a82a2014-01-03 12:53:47 +01003270 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003271}
3272
Alexander Belopolsky40018472011-02-26 01:02:56 +00003273PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003274PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003276 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003277 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003278 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003279 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003280 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 Py_INCREF(obj);
3282 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003283 }
3284 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 /* For a Unicode subtype that's not a Unicode object,
3286 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003287 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003288 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003289 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003290 "Can't convert '%.100s' object to str implicitly",
3291 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003292 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003293}
3294
Alexander Belopolsky40018472011-02-26 01:02:56 +00003295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003296PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003297 const char *encoding,
3298 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003299{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003300 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003301 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003302
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003304 PyErr_BadInternalCall();
3305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003307
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003308 /* Decoding bytes objects is the most common case and should be fast */
3309 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003310 if (PyBytes_GET_SIZE(obj) == 0) {
3311 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3312 return NULL;
3313 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003314 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003315 }
3316 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003317 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3318 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003319 }
3320
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003321 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 PyErr_SetString(PyExc_TypeError,
3323 "decoding str is not supported");
3324 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003325 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003326
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003327 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3328 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3329 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003330 "decoding to str: need a bytes-like object, %.80s found",
3331 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003332 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003333 }
Tim Petersced69f82003-09-16 20:30:58 +00003334
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003335 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003336 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003337 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3338 return NULL;
3339 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003340 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003342
Serhiy Storchaka05997252013-01-26 12:14:02 +02003343 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003344 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003345 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346}
3347
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is replaced by a single '_', except at the very start of
   the name (dropped) and at the end of the name (dropped as well, because
   the '_' is only emitted in front of the next kept character).

   `lower` is the output buffer and `lower_len` its size in bytes, including
   room for the terminating null byte.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters; this includes lower_len == 0, in which case
   nothing is written). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot even hold the terminator; bail out before
       `lower_len - 1` wraps around. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    /* Last usable position: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    /* Set when one or more separator characters were skipped since the last
       kept character. */
    punct = 0;
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 Py_ssize_t size,
3400 const char *encoding,
3401 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003402{
3403 PyObject *buffer = NULL, *unicode;
3404 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003405 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3406
Victor Stinner22eb6892019-06-26 00:51:05 +02003407 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3408 return NULL;
3409 }
3410
Victor Stinnered076ed2019-06-26 01:49:32 +02003411 if (size == 0) {
3412 _Py_RETURN_UNICODE_EMPTY();
3413 }
3414
Victor Stinner942889a2016-09-05 15:40:10 -07003415 if (encoding == NULL) {
3416 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3417 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003418
Fred Drakee4315f52000-05-09 19:53:39 +00003419 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003420 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3421 char *lower = buflower;
3422
3423 /* Fast paths */
3424 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3425 lower += 3;
3426 if (*lower == '_') {
3427 /* Match "utf8" and "utf_8" */
3428 lower++;
3429 }
3430
3431 if (lower[0] == '8' && lower[1] == 0) {
3432 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3433 }
3434 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3435 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3436 }
3437 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3438 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3439 }
3440 }
3441 else {
3442 if (strcmp(lower, "ascii") == 0
3443 || strcmp(lower, "us_ascii") == 0) {
3444 return PyUnicode_DecodeASCII(s, size, errors);
3445 }
Steve Dowercc16be82016-09-08 10:35:16 -07003446 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003447 else if (strcmp(lower, "mbcs") == 0) {
3448 return PyUnicode_DecodeMBCS(s, size, errors);
3449 }
3450 #endif
3451 else if (strcmp(lower, "latin1") == 0
3452 || strcmp(lower, "latin_1") == 0
3453 || strcmp(lower, "iso_8859_1") == 0
3454 || strcmp(lower, "iso8859_1") == 0) {
3455 return PyUnicode_DecodeLatin1(s, size, errors);
3456 }
3457 }
Victor Stinner37296e82010-06-10 13:36:23 +00003458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459
3460 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003461 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003462 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003463 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003464 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 if (buffer == NULL)
3466 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003467 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 if (unicode == NULL)
3469 goto onError;
3470 if (!PyUnicode_Check(unicode)) {
3471 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003472 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003473 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003474 encoding,
3475 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 Py_DECREF(unicode);
3477 goto onError;
3478 }
3479 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003480 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003481
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 Py_XDECREF(buffer);
3484 return NULL;
3485}
3486
Alexander Belopolsky40018472011-02-26 01:02:56 +00003487PyObject *
3488PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003489 const char *encoding,
3490 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003491{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003492 if (!PyUnicode_Check(unicode)) {
3493 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003494 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003495 }
3496
Serhiy Storchaka00939072016-10-27 21:05:49 +03003497 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3498 "PyUnicode_AsDecodedObject() is deprecated; "
3499 "use PyCodec_Decode() to decode from str", 1) < 0)
3500 return NULL;
3501
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003502 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003503 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003504
3505 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003506 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003507}
3508
Alexander Belopolsky40018472011-02-26 01:02:56 +00003509PyObject *
3510PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003511 const char *encoding,
3512 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003513{
3514 PyObject *v;
3515
3516 if (!PyUnicode_Check(unicode)) {
3517 PyErr_BadArgument();
3518 goto onError;
3519 }
3520
Serhiy Storchaka00939072016-10-27 21:05:49 +03003521 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3522 "PyUnicode_AsDecodedUnicode() is deprecated; "
3523 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3524 return NULL;
3525
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003526 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003527 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003528
3529 /* Decode via the codec registry */
3530 v = PyCodec_Decode(unicode, encoding, errors);
3531 if (v == NULL)
3532 goto onError;
3533 if (!PyUnicode_Check(v)) {
3534 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003535 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003536 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003537 encoding,
3538 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003539 Py_DECREF(v);
3540 goto onError;
3541 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003542 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003543
Benjamin Peterson29060642009-01-31 22:14:21 +00003544 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003545 return NULL;
3546}
3547
Alexander Belopolsky40018472011-02-26 01:02:56 +00003548PyObject *
3549PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003550 Py_ssize_t size,
3551 const char *encoding,
3552 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553{
3554 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003555
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003556 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3560 Py_DECREF(unicode);
3561 return v;
3562}
3563
Alexander Belopolsky40018472011-02-26 01:02:56 +00003564PyObject *
3565PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003566 const char *encoding,
3567 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003568{
3569 PyObject *v;
3570
3571 if (!PyUnicode_Check(unicode)) {
3572 PyErr_BadArgument();
3573 goto onError;
3574 }
3575
Serhiy Storchaka00939072016-10-27 21:05:49 +03003576 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3577 "PyUnicode_AsEncodedObject() is deprecated; "
3578 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3579 "or PyCodec_Encode() for generic encoding", 1) < 0)
3580 return NULL;
3581
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003582 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003583 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003584
3585 /* Encode via the codec registry */
3586 v = PyCodec_Encode(unicode, encoding, errors);
3587 if (v == NULL)
3588 goto onError;
3589 return v;
3590
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003592 return NULL;
3593}
3594
Victor Stinner1b579672011-12-17 05:47:23 +01003595
Victor Stinner2cba6b82018-01-10 22:46:15 +01003596static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003597unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003598 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003599{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003600 Py_ssize_t wlen;
3601 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3602 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003603 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003604 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003605
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003606 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003607 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003608 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003609 return NULL;
3610 }
3611
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003612 char *str;
3613 size_t error_pos;
3614 const char *reason;
3615 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003616 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003617 PyMem_Free(wstr);
3618
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003619 if (res != 0) {
3620 if (res == -2) {
3621 PyObject *exc;
3622 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3623 "locale", unicode,
3624 (Py_ssize_t)error_pos,
3625 (Py_ssize_t)(error_pos+1),
3626 reason);
3627 if (exc != NULL) {
3628 PyCodec_StrictErrors(exc);
3629 Py_DECREF(exc);
3630 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003631 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003632 else if (res == -3) {
3633 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3634 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003635 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003636 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003637 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003638 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003639 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003640
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003641 PyObject *bytes = PyBytes_FromString(str);
3642 PyMem_RawFree(str);
3643 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003644}
3645
Victor Stinnerad158722010-10-27 00:25:46 +00003646PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003647PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3648{
Victor Stinner709d23d2019-05-02 14:56:30 -04003649 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3650 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003651}
3652
3653PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003654PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003655{
Victor Stinner81a7be32020-04-14 15:14:01 +02003656 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003657 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3658 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003659 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003660 fs_codec->error_handler,
3661 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003662 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003663#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003664 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003665 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003666 fs_codec->encoding,
3667 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003668 }
Victor Stinnerad158722010-10-27 00:25:46 +00003669#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003670 else {
3671 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3672 machinery is not ready and so cannot be used:
3673 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003674 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3675 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003676 assert(filesystem_errors != NULL);
3677 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3678 assert(errors != _Py_ERROR_UNKNOWN);
3679#ifdef _Py_FORCE_UTF8_FS_ENCODING
3680 return unicode_encode_utf8(unicode, errors, NULL);
3681#else
3682 return unicode_encode_locale(unicode, errors, 0);
3683#endif
3684 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003685}
3686
Alexander Belopolsky40018472011-02-26 01:02:56 +00003687PyObject *
3688PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003689 const char *encoding,
3690 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691{
3692 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003693 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 if (!PyUnicode_Check(unicode)) {
3696 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003697 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 }
Fred Drakee4315f52000-05-09 19:53:39 +00003699
Victor Stinner22eb6892019-06-26 00:51:05 +02003700 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3701 return NULL;
3702 }
3703
Victor Stinner942889a2016-09-05 15:40:10 -07003704 if (encoding == NULL) {
3705 return _PyUnicode_AsUTF8String(unicode, errors);
3706 }
3707
Fred Drakee4315f52000-05-09 19:53:39 +00003708 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003709 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3710 char *lower = buflower;
3711
3712 /* Fast paths */
3713 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3714 lower += 3;
3715 if (*lower == '_') {
3716 /* Match "utf8" and "utf_8" */
3717 lower++;
3718 }
3719
3720 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003722 }
3723 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3724 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3725 }
3726 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3727 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3728 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003729 }
Victor Stinner942889a2016-09-05 15:40:10 -07003730 else {
3731 if (strcmp(lower, "ascii") == 0
3732 || strcmp(lower, "us_ascii") == 0) {
3733 return _PyUnicode_AsASCIIString(unicode, errors);
3734 }
Steve Dowercc16be82016-09-08 10:35:16 -07003735#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003736 else if (strcmp(lower, "mbcs") == 0) {
3737 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3738 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003739#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003740 else if (strcmp(lower, "latin1") == 0 ||
3741 strcmp(lower, "latin_1") == 0 ||
3742 strcmp(lower, "iso_8859_1") == 0 ||
3743 strcmp(lower, "iso8859_1") == 0) {
3744 return _PyUnicode_AsLatin1String(unicode, errors);
3745 }
3746 }
Victor Stinner37296e82010-06-10 13:36:23 +00003747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748
3749 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003750 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003752 return NULL;
3753
3754 /* The normal path */
3755 if (PyBytes_Check(v))
3756 return v;
3757
3758 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003759 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003760 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003761 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003762
3763 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003764 "encoder %s returned bytearray instead of bytes; "
3765 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003766 encoding);
3767 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003768 Py_DECREF(v);
3769 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003770 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003771
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003772 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3773 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003774 Py_DECREF(v);
3775 return b;
3776 }
3777
3778 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003779 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003780 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003781 encoding,
3782 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003783 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003784 return NULL;
3785}
3786
Alexander Belopolsky40018472011-02-26 01:02:56 +00003787PyObject *
3788PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003789 const char *encoding,
3790 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003791{
3792 PyObject *v;
3793
3794 if (!PyUnicode_Check(unicode)) {
3795 PyErr_BadArgument();
3796 goto onError;
3797 }
3798
Serhiy Storchaka00939072016-10-27 21:05:49 +03003799 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3800 "PyUnicode_AsEncodedUnicode() is deprecated; "
3801 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3802 return NULL;
3803
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003804 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003805 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003806
3807 /* Encode via the codec registry */
3808 v = PyCodec_Encode(unicode, encoding, errors);
3809 if (v == NULL)
3810 goto onError;
3811 if (!PyUnicode_Check(v)) {
3812 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003813 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003814 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003815 encoding,
3816 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003817 Py_DECREF(v);
3818 goto onError;
3819 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003821
Benjamin Peterson29060642009-01-31 22:14:21 +00003822 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 return NULL;
3824}
3825
Victor Stinner2cba6b82018-01-10 22:46:15 +01003826static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003827unicode_decode_locale(const char *str, Py_ssize_t len,
3828 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003829{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003830 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3831 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003832 return NULL;
3833 }
3834
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003835 wchar_t *wstr;
3836 size_t wlen;
3837 const char *reason;
3838 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003839 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003840 if (res != 0) {
3841 if (res == -2) {
3842 PyObject *exc;
3843 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3844 "locale", str, len,
3845 (Py_ssize_t)wlen,
3846 (Py_ssize_t)(wlen + 1),
3847 reason);
3848 if (exc != NULL) {
3849 PyCodec_StrictErrors(exc);
3850 Py_DECREF(exc);
3851 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003852 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003853 else if (res == -3) {
3854 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3855 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003856 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003857 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003858 }
Victor Stinner2f197072011-12-17 07:08:30 +01003859 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003860 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003861
3862 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3863 PyMem_RawFree(wstr);
3864 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003865}
3866
3867PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003868PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3869 const char *errors)
3870{
Victor Stinner709d23d2019-05-02 14:56:30 -04003871 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3872 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003873}
3874
3875PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003876PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003877{
3878 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003879 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3880 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003881}
3882
3883
3884PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003885PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003886 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003887 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3888}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003889
Christian Heimes5894ba72007-11-04 11:43:14 +00003890PyObject*
3891PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3892{
Victor Stinner81a7be32020-04-14 15:14:01 +02003893 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003894 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3895 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003896 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003897 fs_codec->error_handler,
3898 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003899 NULL);
3900 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003901#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003902 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003903 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003904 fs_codec->encoding,
3905 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003906 }
Victor Stinnerad158722010-10-27 00:25:46 +00003907#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003908 else {
3909 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3910 machinery is not ready and so cannot be used:
3911 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003912 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3913 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003914 assert(filesystem_errors != NULL);
3915 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3916 assert(errors != _Py_ERROR_UNKNOWN);
3917#ifdef _Py_FORCE_UTF8_FS_ENCODING
3918 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3919#else
3920 return unicode_decode_locale(s, size, errors, 0);
3921#endif
3922 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003923}
3924
Martin v. Löwis011e8422009-05-05 04:43:17 +00003925
3926int
3927PyUnicode_FSConverter(PyObject* arg, void* addr)
3928{
Brett Cannonec6ce872016-09-06 15:50:29 -07003929 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003930 PyObject *output = NULL;
3931 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003932 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003933 if (arg == NULL) {
3934 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003935 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003936 return 1;
3937 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003938 path = PyOS_FSPath(arg);
3939 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003940 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003941 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003942 if (PyBytes_Check(path)) {
3943 output = path;
3944 }
3945 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3946 output = PyUnicode_EncodeFSDefault(path);
3947 Py_DECREF(path);
3948 if (!output) {
3949 return 0;
3950 }
3951 assert(PyBytes_Check(output));
3952 }
3953
Victor Stinner0ea2a462010-04-30 00:22:08 +00003954 size = PyBytes_GET_SIZE(output);
3955 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003956 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003957 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003958 Py_DECREF(output);
3959 return 0;
3960 }
3961 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003962 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003963}
3964
3965
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003966int
3967PyUnicode_FSDecoder(PyObject* arg, void* addr)
3968{
Brett Cannona5711202016-09-06 19:36:01 -07003969 int is_buffer = 0;
3970 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003971 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003972 if (arg == NULL) {
3973 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003974 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003975 return 1;
3976 }
Brett Cannona5711202016-09-06 19:36:01 -07003977
3978 is_buffer = PyObject_CheckBuffer(arg);
3979 if (!is_buffer) {
3980 path = PyOS_FSPath(arg);
3981 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003982 return 0;
3983 }
Brett Cannona5711202016-09-06 19:36:01 -07003984 }
3985 else {
3986 path = arg;
3987 Py_INCREF(arg);
3988 }
3989
3990 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003991 output = path;
3992 }
3993 else if (PyBytes_Check(path) || is_buffer) {
3994 PyObject *path_bytes = NULL;
3995
3996 if (!PyBytes_Check(path) &&
3997 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003998 "path should be string, bytes, or os.PathLike, not %.200s",
3999 Py_TYPE(arg)->tp_name)) {
4000 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004001 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004002 }
4003 path_bytes = PyBytes_FromObject(path);
4004 Py_DECREF(path);
4005 if (!path_bytes) {
4006 return 0;
4007 }
4008 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4009 PyBytes_GET_SIZE(path_bytes));
4010 Py_DECREF(path_bytes);
4011 if (!output) {
4012 return 0;
4013 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004014 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004015 else {
4016 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004017 "path should be string, bytes, or os.PathLike, not %.200s",
4018 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004019 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004020 return 0;
4021 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004022 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004023 Py_DECREF(output);
4024 return 0;
4025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004026 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004027 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004028 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004029 Py_DECREF(output);
4030 return 0;
4031 }
4032 *(PyObject**)addr = output;
4033 return Py_CLEANUP_SUPPORTED;
4034}
4035
4036
Inada Naoki02a4d572020-02-27 13:48:59 +09004037static int unicode_fill_utf8(PyObject *unicode);
4038
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004039const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004041{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004042 if (!PyUnicode_Check(unicode)) {
4043 PyErr_BadArgument();
4044 return NULL;
4045 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004046 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004047 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004049 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004050 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 return NULL;
4052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 }
4054
4055 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004056 *psize = PyUnicode_UTF8_LENGTH(unicode);
4057 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004058}
4059
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004060const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4064}
4065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066Py_UNICODE *
4067PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4068{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 if (!PyUnicode_Check(unicode)) {
4070 PyErr_BadArgument();
4071 return NULL;
4072 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004073 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4074 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004075 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004076 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004077 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078
Serhiy Storchakac46db922018-10-23 22:58:24 +03004079 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4080 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4081 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004084 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4085 if (w == NULL) {
4086 PyErr_NoMemory();
4087 return NULL;
4088 }
4089 unicode_copy_as_widechar(unicode, w, wlen + 1);
4090 _PyUnicode_WSTR(unicode) = w;
4091 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4092 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 }
4094 }
4095 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004096 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004097 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004098}
4099
Inada Naoki610a60c2020-06-18 17:30:53 +09004100/* Deprecated APIs */
4101
4102_Py_COMP_DIAG_PUSH
4103_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4104
Alexander Belopolsky40018472011-02-26 01:02:56 +00004105Py_UNICODE *
4106PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109}
4110
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004111const Py_UNICODE *
4112_PyUnicode_AsUnicode(PyObject *unicode)
4113{
4114 Py_ssize_t size;
4115 const Py_UNICODE *wstr;
4116
4117 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4118 if (wstr && wcslen(wstr) != (size_t)size) {
4119 PyErr_SetString(PyExc_ValueError, "embedded null character");
4120 return NULL;
4121 }
4122 return wstr;
4123}
4124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125
Alexander Belopolsky40018472011-02-26 01:02:56 +00004126Py_ssize_t
4127PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128{
4129 if (!PyUnicode_Check(unicode)) {
4130 PyErr_BadArgument();
4131 goto onError;
4132 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004133 if (_PyUnicode_WSTR(unicode) == NULL) {
4134 if (PyUnicode_AsUnicode(unicode) == NULL)
4135 goto onError;
4136 }
4137 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 return -1;
4141}
4142
Inada Naoki610a60c2020-06-18 17:30:53 +09004143_Py_COMP_DIAG_POP
4144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004145Py_ssize_t
4146PyUnicode_GetLength(PyObject *unicode)
4147{
Victor Stinner07621332012-06-16 04:53:46 +02004148 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004149 PyErr_BadArgument();
4150 return -1;
4151 }
Victor Stinner07621332012-06-16 04:53:46 +02004152 if (PyUnicode_READY(unicode) == -1)
4153 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154 return PyUnicode_GET_LENGTH(unicode);
4155}
4156
4157Py_UCS4
4158PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4159{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004160 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004161 int kind;
4162
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004163 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004164 PyErr_BadArgument();
4165 return (Py_UCS4)-1;
4166 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004167 if (PyUnicode_READY(unicode) == -1) {
4168 return (Py_UCS4)-1;
4169 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004170 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004171 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172 return (Py_UCS4)-1;
4173 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004174 data = PyUnicode_DATA(unicode);
4175 kind = PyUnicode_KIND(unicode);
4176 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004177}
4178
4179int
4180PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4181{
4182 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004183 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004184 return -1;
4185 }
Victor Stinner488fa492011-12-12 00:01:39 +01004186 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004187 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004188 PyErr_SetString(PyExc_IndexError, "string index out of range");
4189 return -1;
4190 }
Victor Stinner488fa492011-12-12 00:01:39 +01004191 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004192 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004193 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4194 PyErr_SetString(PyExc_ValueError, "character out of range");
4195 return -1;
4196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4198 index, ch);
4199 return 0;
4200}
4201
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage; callers must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4207
Victor Stinner554f3f02010-06-16 23:33:54 +00004208/* create or adjust a UnicodeDecodeError */
4209static void
4210make_decode_exception(PyObject **exceptionObject,
4211 const char *encoding,
4212 const char *input, Py_ssize_t length,
4213 Py_ssize_t startpos, Py_ssize_t endpos,
4214 const char *reason)
4215{
4216 if (*exceptionObject == NULL) {
4217 *exceptionObject = PyUnicodeDecodeError_Create(
4218 encoding, input, length, startpos, endpos, reason);
4219 }
4220 else {
4221 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4222 goto onError;
4223 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4224 goto onError;
4225 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4226 goto onError;
4227 }
4228 return;
4229
4230onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004231 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004232}
4233
Steve Dowercc16be82016-09-08 10:35:16 -07004234#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004235static int
4236widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4237{
4238 if (newsize > *size) {
4239 wchar_t *newbuf = *buf;
4240 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4241 PyErr_NoMemory();
4242 return -1;
4243 }
4244 *buf = newbuf;
4245 }
4246 *size = newsize;
4247 return 0;
4248}
4249
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250/* error handling callback helper:
4251 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004252 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253 and adjust various state variables.
4254 return 0 on success, -1 on error
4255*/
4256
Alexander Belopolsky40018472011-02-26 01:02:56 +00004257static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004258unicode_decode_call_errorhandler_wchar(
4259 const char *errors, PyObject **errorHandler,
4260 const char *encoding, const char *reason,
4261 const char **input, const char **inend, Py_ssize_t *startinpos,
4262 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004263 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004265 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004266
4267 PyObject *restuple = NULL;
4268 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004269 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004270 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004271 Py_ssize_t requiredsize;
4272 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004273 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004274 wchar_t *repwstr;
4275 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276
4277 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004278 *errorHandler = PyCodec_LookupError(errors);
4279 if (*errorHandler == NULL)
4280 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 }
4282
Victor Stinner554f3f02010-06-16 23:33:54 +00004283 make_decode_exception(exceptionObject,
4284 encoding,
4285 *input, *inend - *input,
4286 *startinpos, *endinpos,
4287 reason);
4288 if (*exceptionObject == NULL)
4289 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290
Petr Viktorinffd97532020-02-11 17:46:57 +01004291 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004295 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004298 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004300
4301 /* Copy back the bytes variables, which might have been modified by the
4302 callback */
4303 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4304 if (!inputobj)
4305 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 *input = PyBytes_AS_STRING(inputobj);
4307 insize = PyBytes_GET_SIZE(inputobj);
4308 *inend = *input + insize;
4309 /* we can DECREF safely, as the exception has another reference,
4310 so the object won't go away. */
4311 Py_DECREF(inputobj);
4312
4313 if (newpos<0)
4314 newpos = insize+newpos;
4315 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004316 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004317 goto onError;
4318 }
4319
4320 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4321 if (repwstr == NULL)
4322 goto onError;
4323 /* need more space? (at least enough for what we
4324 have+the replacement+the rest of the string (starting
4325 at the new input position), so we won't have to check space
4326 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004327 requiredsize = *outpos;
4328 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4329 goto overflow;
4330 requiredsize += repwlen;
4331 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4332 goto overflow;
4333 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004334 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004335 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004336 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004338 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004339 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004340 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004341 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004342 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004343 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 *endinpos = newpos;
4345 *inptr = *input + newpos;
4346
4347 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004348 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004349 return 0;
4350
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004351 overflow:
4352 PyErr_SetString(PyExc_OverflowError,
4353 "decoded result is too long for a Python string");
4354
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004355 onError:
4356 Py_XDECREF(restuple);
4357 return -1;
4358}
Steve Dowercc16be82016-09-08 10:35:16 -07004359#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004360
4361static int
4362unicode_decode_call_errorhandler_writer(
4363 const char *errors, PyObject **errorHandler,
4364 const char *encoding, const char *reason,
4365 const char **input, const char **inend, Py_ssize_t *startinpos,
4366 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4367 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4368{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004369 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004370
4371 PyObject *restuple = NULL;
4372 PyObject *repunicode = NULL;
4373 Py_ssize_t insize;
4374 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004375 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004376 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004377 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004378 int need_to_grow = 0;
4379 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004380
4381 if (*errorHandler == NULL) {
4382 *errorHandler = PyCodec_LookupError(errors);
4383 if (*errorHandler == NULL)
4384 goto onError;
4385 }
4386
4387 make_decode_exception(exceptionObject,
4388 encoding,
4389 *input, *inend - *input,
4390 *startinpos, *endinpos,
4391 reason);
4392 if (*exceptionObject == NULL)
4393 goto onError;
4394
Petr Viktorinffd97532020-02-11 17:46:57 +01004395 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004396 if (restuple == NULL)
4397 goto onError;
4398 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004399 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004400 goto onError;
4401 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004402 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004403 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004404
4405 /* Copy back the bytes variables, which might have been modified by the
4406 callback */
4407 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4408 if (!inputobj)
4409 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004410 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004411 *input = PyBytes_AS_STRING(inputobj);
4412 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004413 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004414 /* we can DECREF safely, as the exception has another reference,
4415 so the object won't go away. */
4416 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004420 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004421 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004423 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424
Victor Stinner170ca6f2013-04-18 00:25:28 +02004425 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004426 if (replen > 1) {
4427 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004428 need_to_grow = 1;
4429 }
4430 new_inptr = *input + newpos;
4431 if (*inend - new_inptr > remain) {
4432 /* We don't know the decoding algorithm here so we make the worst
4433 assumption that one byte decodes to one unicode character.
4434 If unfortunately one byte could decode to more unicode characters,
4435 the decoder may write out-of-bound then. Is it possible for the
4436 algorithms using this function? */
4437 writer->min_length += *inend - new_inptr - remain;
4438 need_to_grow = 1;
4439 }
4440 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004441 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004442 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004443 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4444 goto onError;
4445 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004446 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004447 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004450 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004451
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004453 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004454 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004458 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459}
4460
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004461/* --- UTF-7 Codec -------------------------------------------------------- */
4462
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463/* See RFC2152 for details. We encode conservatively and decode liberally. */
4464
/* Three simple macros defining base-64.  Each argument is evaluated
   more than once, so pass expressions without side effects. */

/* Is c a character of the base-64 alphabet (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c)                                    \
    ((c) == '+' || (c) == '/' ||                        \
     ('0' <= (c) && (c) <= '9') ||                      \
     ('a' <= (c) && (c) <= 'z') ||                      \
     ('A' <= (c) && (c) <= 'Z'))

/* Given that c is a base-64 character, what is its base-64 value?
   ('A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   anything else, i.e. '/', -> 63.) */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    (!((c) > 127 || (c) == '+'))
4495
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the character code; only 1..127 can ever be direct.
 * directO  : non-zero if Set O characters are written as themselves.
 * directWS : non-zero if whitespace is written as itself.
 *
 * All arguments are parenthesized so that compound expressions such as
 * `a || b` can be passed safely.  c is evaluated several times, so it
 * must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541
Alexander Belopolsky40018472011-02-26 01:02:56 +00004542PyObject *
4543PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004544 Py_ssize_t size,
4545 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004547 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4548}
4549
Antoine Pitrou244651a2009-05-04 18:56:13 +00004550/* The decoder. The only state we preserve is our read position,
4551 * i.e. how many characters we have consumed. So if we end in the
4552 * middle of a shift sequence we have to back off the read position
4553 * and the output to the beginning of the sequence, otherwise we lose
4554 * all the shift state (seen bits, number of bits seen, high
4555 * surrogate). */
4556
/* Decode `size` bytes of UTF-7 data starting at `s`.
 *
 * s, size  : input bytes and their count; an empty input yields the
 *            empty string immediately.
 * errors   : error handler name, forwarded to
 *            unicode_decode_call_errorhandler_writer().
 * consumed : if non-NULL, receives the number of input bytes consumed.
 *            When the input ends inside a shift ('+...') sequence, it is
 *            set to the position of the '+' that opened the sequence and
 *            the output is cut back to what had been written before that
 *            sequence.  If NULL, input that ends inside a shift sequence
 *            in an inconsistent state (pending surrogate, at least 6
 *            left-over bits, or non-zero padding bits) is reported
 *            through the error handler as "unterminated shift sequence".
 *
 * Returns the result of _PyUnicodeWriter_Finish() (or of
 * PyUnicode_FromKindAndData() on the truncating path), or NULL on error.
 *
 * Local state: inShift is non-zero while inside a base-64 section;
 * base64buffer/base64bits accumulate not-yet-emitted bits; surrogate holds
 * a pending high surrogate; shiftOutStart is the writer position at which
 * the current shift sequence began. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004557PyObject *
4558PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004559 Py_ssize_t size,
4560 const char *errors,
4561 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004562{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004564 Py_ssize_t startinpos;
4565 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004567 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004568 const char *errmsg = "";
4569 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004570 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 unsigned int base64bits = 0;
4572 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004573 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 PyObject *errorHandler = NULL;
4575 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004577 if (size == 0) {
4578 if (consumed)
4579 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004580 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004581 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004583 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004584 _PyUnicodeWriter_Init(&writer);
4585 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004586
4587 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588 e = s + size;
4589
4590 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004591 Py_UCS4 ch;
/* Re-entry point: the end-of-string error path below jumps back here when
   the error handler leaves input to process (s < e). */
Benjamin Peterson29060642009-01-31 22:14:21 +00004592 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004593 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004594
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 if (inShift) { /* in a base-64 section */
4596 if (IS_BASE64(ch)) { /* consume a base-64 character */
4597 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4598 base64bits += 6;
4599 s++;
4600 if (base64bits >= 16) {
4601 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004602 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 base64bits -= 16;
4604 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004605 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 if (surrogate) {
4607 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004608 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4609 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004610 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004611 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004613 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 }
4615 else {
/* Not a low surrogate: emit the pending high surrogate on its own, then
   fall through so outCh is handled like any other 16-bit unit. */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004616 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004617 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 }
4620 }
Victor Stinner551ac952011-11-29 22:58:13 +01004621 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 /* first surrogate */
4623 surrogate = outCh;
4624 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004626 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004627 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 }
4629 }
4630 }
4631 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 if (base64bits > 0) { /* left-over bits */
4634 if (base64bits >= 6) {
4635 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004636 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004637 errmsg = "partial character in shift sequence";
4638 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004639 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004640 else {
4641 /* Some bits remain; they should be zero */
4642 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004643 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644 errmsg = "non-zero padding bits in shift sequence";
4645 goto utf7Error;
4646 }
4647 }
4648 }
/* A pending high surrogate is written out only when the terminating byte
   is one that decodes directly; in every case it is then discarded. */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004649 if (surrogate && DECODE_DIRECT(ch)) {
4650 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4651 goto onError;
4652 }
4653 surrogate = 0;
4654 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 /* '-' is absorbed; other terminating
4656 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004657 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004658 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004659 }
4660 }
4661 else if ( ch == '+' ) {
/* Remember where the shift sequence starts: used as the error start
   position and, in stateful mode, as the value stored in *consumed. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 s++; /* consume '+' */
4664 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004666 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004667 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004668 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004669 else if (s < e && !IS_BASE64(*s)) {
4670 s++;
4671 errmsg = "ill-formed sequence";
4672 goto utf7Error;
4673 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004676 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004677 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004679 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004680 }
4681 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004682 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004683 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004684 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004685 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004686 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 else {
4688 startinpos = s-starts;
4689 s++;
4690 errmsg = "unexpected special character";
4691 goto utf7Error;
4692 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004693 continue;
/* Common error path for the loop body: errmsg and startinpos were set by
   the branch that jumped here; the error range ends at the current s.
   The handler may update starts, e and s. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004696 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 errors, &errorHandler,
4698 "utf7", errmsg,
4699 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004700 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004702 }
4703
Antoine Pitrou244651a2009-05-04 18:56:13 +00004704 /* end of string */
4705
4706 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4707 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004708 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004709 if (surrogate ||
4710 (base64bits >= 6) ||
4711 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004713 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004714 errors, &errorHandler,
4715 "utf7", "unterminated shift sequence",
4716 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004717 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004718 goto onError;
/* The handler may have moved s back before e; if so, resume decoding
   inside the main loop. */
4719 if (s < e)
4720 goto restart;
4721 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004722 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004723
4724 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004725 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004727 *consumed = startinpos;
/* Output was produced inside the unfinished shift sequence and the writer
   holds non-ASCII data: build the result directly from the first
   shiftOutStart characters instead of finishing the writer. */
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004728 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004729 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004730 writer.kind, writer.data, shiftOutStart);
4731 Py_XDECREF(errorHandler);
4732 Py_XDECREF(exc);
4733 _PyUnicodeWriter_Dealloc(&writer);
4734 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004735 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004736 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004737 }
4738 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004739 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004740 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004741 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 Py_XDECREF(errorHandler);
4744 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004745 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004746
/* Failure exit: release the handler, the exception object and the writer. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004747 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 Py_XDECREF(errorHandler);
4749 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004750 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751 return NULL;
4752}
4753
4754
Alexander Belopolsky40018472011-02-26 01:02:56 +00004755PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004756_PyUnicode_EncodeUTF7(PyObject *str,
4757 int base64SetO,
4758 int base64WhiteSpace,
4759 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004760{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004761 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004762 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004763 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004764 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004766 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004767 unsigned int base64bits = 0;
4768 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004769 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004770 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004771
Benjamin Petersonbac79492012-01-14 13:34:47 -05004772 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004773 return NULL;
4774 kind = PyUnicode_KIND(str);
4775 data = PyUnicode_DATA(str);
4776 len = PyUnicode_GET_LENGTH(str);
4777
4778 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004780
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004781 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004782 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004783 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004784 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004785 if (v == NULL)
4786 return NULL;
4787
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004788 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004789 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004790 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004791
Antoine Pitrou244651a2009-05-04 18:56:13 +00004792 if (inShift) {
4793 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4794 /* shifting out */
4795 if (base64bits) { /* output remaining bits */
4796 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4797 base64buffer = 0;
4798 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004799 }
4800 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 /* Characters not in the BASE64 set implicitly unshift the sequence
4802 so no '-' is required, except if the character is itself a '-' */
4803 if (IS_BASE64(ch) || ch == '-') {
4804 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004805 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004806 *out++ = (char) ch;
4807 }
4808 else {
4809 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004810 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004811 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004812 else { /* not in a shift sequence */
4813 if (ch == '+') {
4814 *out++ = '+';
4815 *out++ = '-';
4816 }
4817 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4818 *out++ = (char) ch;
4819 }
4820 else {
4821 *out++ = '+';
4822 inShift = 1;
4823 goto encode_char;
4824 }
4825 }
4826 continue;
4827encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004828 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004829 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004830
Antoine Pitrou244651a2009-05-04 18:56:13 +00004831 /* code first surrogate */
4832 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004833 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004834 while (base64bits >= 6) {
4835 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4836 base64bits -= 6;
4837 }
4838 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004839 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004840 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004841 base64bits += 16;
4842 base64buffer = (base64buffer << 16) | ch;
4843 while (base64bits >= 6) {
4844 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4845 base64bits -= 6;
4846 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004847 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004848 if (base64bits)
4849 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4850 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004851 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004852 if (_PyBytes_Resize(&v, out - start) < 0)
4853 return NULL;
4854 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004855}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004856PyObject *
4857PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4858 Py_ssize_t size,
4859 int base64SetO,
4860 int base64WhiteSpace,
4861 const char *errors)
4862{
4863 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004864 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004865 if (tmp == NULL)
4866 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004867 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004868 base64WhiteSpace, errors);
4869 Py_DECREF(tmp);
4870 return result;
4871}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004872
Antoine Pitrou244651a2009-05-04 18:56:13 +00004873#undef IS_BASE64
4874#undef FROM_BASE64
4875#undef TO_BASE64
4876#undef DECODE_DIRECT
4877#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004878
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879/* --- UTF-8 Codec -------------------------------------------------------- */
4880
Alexander Belopolsky40018472011-02-26 01:02:56 +00004881PyObject *
4882PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004883 Py_ssize_t size,
4884 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885{
Walter Dörwald69652032004-09-07 20:24:22 +00004886 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4887}
4888
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889#include "stringlib/asciilib.h"
4890#include "stringlib/codecs.h"
4891#include "stringlib/undef.h"
4892
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004893#include "stringlib/ucs1lib.h"
4894#include "stringlib/codecs.h"
4895#include "stringlib/undef.h"
4896
4897#include "stringlib/ucs2lib.h"
4898#include "stringlib/codecs.h"
4899#include "stringlib/undef.h"
4900
4901#include "stringlib/ucs4lib.h"
4902#include "stringlib/codecs.h"
4903#include "stringlib/undef.h"
4904
Antoine Pitrouab868312009-01-10 15:40:25 +00004905/* Mask to quickly check whether a C 'long' contains a
4906 non-ASCII, UTF8-encoded char. */
4907#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004908# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004909#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004910# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004911#else
4912# error C 'long' size should be either 4 or 8!
4913#endif
4914
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915static Py_ssize_t
4916ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004917{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004919 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004920
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004921 /*
4922 * Issue #17237: m68k is a bit different from most architectures in
4923 * that objects do not use "natural alignment" - for example, int and
4924 * long are only aligned at 2-byte boundaries. Therefore the assert()
4925 * won't work; also, tests have shown that skipping the "optimised
4926 * version" will even speed up m68k.
4927 */
4928#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004929#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004930 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4931 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 /* Fast path, see in STRINGLIB(utf8_decode) for
4933 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004934 /* Help allocation */
4935 const char *_p = p;
4936 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 while (_p < aligned_end) {
4938 unsigned long value = *(const unsigned long *) _p;
4939 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 *((unsigned long *)q) = value;
4942 _p += SIZEOF_LONG;
4943 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004944 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 p = _p;
4946 while (p < end) {
4947 if ((unsigned char)*p & 0x80)
4948 break;
4949 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004951 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004953#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004954#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 while (p < end) {
4956 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4957 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004958 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004959 /* Help allocation */
4960 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004962 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004963 if (value & ASCII_CHAR_MASK)
4964 break;
4965 _p += SIZEOF_LONG;
4966 }
4967 p = _p;
4968 if (_p == end)
4969 break;
4970 }
4971 if ((unsigned char)*p & 0x80)
4972 break;
4973 ++p;
4974 }
4975 memcpy(dest, start, p - start);
4976 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977}
Antoine Pitrouab868312009-01-10 15:40:25 +00004978
Victor Stinner709d23d2019-05-02 14:56:30 -04004979static PyObject *
4980unicode_decode_utf8(const char *s, Py_ssize_t size,
4981 _Py_error_handler error_handler, const char *errors,
4982 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004983{
Victor Stinner785938e2011-12-11 20:09:03 +01004984 if (size == 0) {
4985 if (consumed)
4986 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004987 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004988 }
4989
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4991 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004992 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 *consumed = 1;
4994 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004995 }
4996
Inada Naoki770847a2019-06-24 12:30:24 +09004997 const char *starts = s;
4998 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004999
Inada Naoki770847a2019-06-24 12:30:24 +09005000 // fast path: try ASCII string.
5001 PyObject *u = PyUnicode_New(size, 127);
5002 if (u == NULL) {
5003 return NULL;
5004 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005005 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005006 if (s == end) {
5007 return u;
5008 }
5009
5010 // Use _PyUnicodeWriter after fast path is failed.
5011 _PyUnicodeWriter writer;
5012 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5013 writer.pos = s - starts;
5014
5015 Py_ssize_t startinpos, endinpos;
5016 const char *errmsg = "";
5017 PyObject *error_handler_obj = NULL;
5018 PyObject *exc = NULL;
5019
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005020 while (s < end) {
5021 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005022 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005023
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005025 if (PyUnicode_IS_ASCII(writer.buffer))
5026 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005027 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005028 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005029 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005030 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005031 } else {
5032 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005033 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005034 }
5035
5036 switch (ch) {
5037 case 0:
5038 if (s == end || consumed)
5039 goto End;
5040 errmsg = "unexpected end of data";
5041 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005042 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 break;
5044 case 1:
5045 errmsg = "invalid start byte";
5046 startinpos = s - starts;
5047 endinpos = startinpos + 1;
5048 break;
5049 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005050 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5051 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5052 {
5053 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005054 goto End;
5055 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005056 /* fall through */
5057 case 3:
5058 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059 errmsg = "invalid continuation byte";
5060 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005061 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005062 break;
5063 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005064 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005065 goto onError;
5066 continue;
5067 }
5068
Victor Stinner1d65d912015-10-05 13:43:50 +02005069 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005070 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005071
5072 switch (error_handler) {
5073 case _Py_ERROR_IGNORE:
5074 s += (endinpos - startinpos);
5075 break;
5076
5077 case _Py_ERROR_REPLACE:
5078 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5079 goto onError;
5080 s += (endinpos - startinpos);
5081 break;
5082
5083 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005084 {
5085 Py_ssize_t i;
5086
Victor Stinner1d65d912015-10-05 13:43:50 +02005087 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5088 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005089 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005090 ch = (Py_UCS4)(unsigned char)(starts[i]);
5091 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5092 ch + 0xdc00);
5093 writer.pos++;
5094 }
5095 s += (endinpos - startinpos);
5096 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005097 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005098
5099 default:
5100 if (unicode_decode_call_errorhandler_writer(
5101 errors, &error_handler_obj,
5102 "utf-8", errmsg,
5103 &starts, &end, &startinpos, &endinpos, &exc, &s,
5104 &writer))
5105 goto onError;
5106 }
Victor Stinner785938e2011-12-11 20:09:03 +01005107 }
5108
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005109End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005110 if (consumed)
5111 *consumed = s - starts;
5112
Victor Stinner1d65d912015-10-05 13:43:50 +02005113 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005114 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005115 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005116
5117onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005118 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005119 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005120 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005121 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005122}
5123
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005124
Victor Stinner709d23d2019-05-02 14:56:30 -04005125PyObject *
5126PyUnicode_DecodeUTF8Stateful(const char *s,
5127 Py_ssize_t size,
5128 const char *errors,
5129 Py_ssize_t *consumed)
5130{
5131 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5132}
5133
5134
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005135/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5136 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005137
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005138 On success, write a pointer to a newly allocated wide character string into
5139 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5140 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005141
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005142 On memory allocation failure, return -1.
5143
5144 On decoding error (if surrogateescape is zero), return -2. If wlen is
5145 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5146 is not NULL, write the decoding error message into *reason. */
5147int
5148_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005149 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005151 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005152 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 wchar_t *unicode;
5154 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005155
Victor Stinner3d4226a2018-08-29 22:21:32 +02005156 int surrogateescape = 0;
5157 int surrogatepass = 0;
5158 switch (errors)
5159 {
5160 case _Py_ERROR_STRICT:
5161 break;
5162 case _Py_ERROR_SURROGATEESCAPE:
5163 surrogateescape = 1;
5164 break;
5165 case _Py_ERROR_SURROGATEPASS:
5166 surrogatepass = 1;
5167 break;
5168 default:
5169 return -3;
5170 }
5171
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005172 /* Note: size will always be longer than the resulting Unicode
5173 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005174 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005175 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005176 }
5177
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005178 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005179 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005180 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005181 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005182
5183 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005184 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005185 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005186 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005187 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005188#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005189 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005190#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005191 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005192#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005193 if (ch > 0xFF) {
5194#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005195 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005196#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005197 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005198 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005199 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5200 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5201#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005202 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005203 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005204 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005205 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005206 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005207
5208 if (surrogateescape) {
5209 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5210 }
5211 else {
5212 /* Is it a valid three-byte code? */
5213 if (surrogatepass
5214 && (e - s) >= 3
5215 && (s[0] & 0xf0) == 0xe0
5216 && (s[1] & 0xc0) == 0x80
5217 && (s[2] & 0xc0) == 0x80)
5218 {
5219 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5220 s += 3;
5221 unicode[outpos++] = ch;
5222 }
5223 else {
5224 PyMem_RawFree(unicode );
5225 if (reason != NULL) {
5226 switch (ch) {
5227 case 0:
5228 *reason = "unexpected end of data";
5229 break;
5230 case 1:
5231 *reason = "invalid start byte";
5232 break;
5233 /* 2, 3, 4 */
5234 default:
5235 *reason = "invalid continuation byte";
5236 break;
5237 }
5238 }
5239 if (wlen != NULL) {
5240 *wlen = s - orig_s;
5241 }
5242 return -2;
5243 }
5244 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005245 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005246 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005247 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005248 if (wlen) {
5249 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005250 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005251 *wstr = unicode;
5252 return 0;
5253}
5254
Victor Stinner5f9cf232019-03-19 01:46:25 +01005255
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005256wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005257_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5258 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005259{
5260 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005261 int res = _Py_DecodeUTF8Ex(arg, arglen,
5262 &wstr, wlen,
5263 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005264 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005265 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5266 assert(res != -3);
5267 if (wlen) {
5268 *wlen = (size_t)res;
5269 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005270 return NULL;
5271 }
5272 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005273}
5274
Antoine Pitrouab868312009-01-10 15:40:25 +00005275
Victor Stinnere47e6982017-12-21 15:45:16 +01005276/* UTF-8 encoder using the surrogateescape error handler .
5277
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005278 On success, return 0 and write the newly allocated character string (use
5279 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005280
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005281 On encoding failure, return -2 and write the position of the invalid
5282 surrogate character into *error_pos (if error_pos is set) and the decoding
5283 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005284
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005285 On memory allocation failure, return -1. */
5286int
5287_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005288 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005289{
5290 const Py_ssize_t max_char_size = 4;
5291 Py_ssize_t len = wcslen(text);
5292
5293 assert(len >= 0);
5294
Victor Stinner3d4226a2018-08-29 22:21:32 +02005295 int surrogateescape = 0;
5296 int surrogatepass = 0;
5297 switch (errors)
5298 {
5299 case _Py_ERROR_STRICT:
5300 break;
5301 case _Py_ERROR_SURROGATEESCAPE:
5302 surrogateescape = 1;
5303 break;
5304 case _Py_ERROR_SURROGATEPASS:
5305 surrogatepass = 1;
5306 break;
5307 default:
5308 return -3;
5309 }
5310
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005311 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5312 return -1;
5313 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005314 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005315 if (raw_malloc) {
5316 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005317 }
5318 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005319 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005320 }
5321 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005322 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005323 }
5324
5325 char *p = bytes;
5326 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005327 for (i = 0; i < len; ) {
5328 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005329 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005330 i++;
5331#if Py_UNICODE_SIZE == 2
5332 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5333 && i < len
5334 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5335 {
5336 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5337 i++;
5338 }
5339#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005340
5341 if (ch < 0x80) {
5342 /* Encode ASCII */
5343 *p++ = (char) ch;
5344
5345 }
5346 else if (ch < 0x0800) {
5347 /* Encode Latin-1 */
5348 *p++ = (char)(0xc0 | (ch >> 6));
5349 *p++ = (char)(0x80 | (ch & 0x3f));
5350 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005351 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005352 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005353 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005354 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005355 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005356 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005357 if (reason != NULL) {
5358 *reason = "encoding error";
5359 }
5360 if (raw_malloc) {
5361 PyMem_RawFree(bytes);
5362 }
5363 else {
5364 PyMem_Free(bytes);
5365 }
5366 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005367 }
5368 *p++ = (char)(ch & 0xff);
5369 }
5370 else if (ch < 0x10000) {
5371 *p++ = (char)(0xe0 | (ch >> 12));
5372 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5373 *p++ = (char)(0x80 | (ch & 0x3f));
5374 }
5375 else { /* ch >= 0x10000 */
5376 assert(ch <= MAX_UNICODE);
5377 /* Encode UCS4 Unicode ordinals */
5378 *p++ = (char)(0xf0 | (ch >> 18));
5379 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5380 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5381 *p++ = (char)(0x80 | (ch & 0x3f));
5382 }
5383 }
5384 *p++ = '\0';
5385
5386 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005387 char *bytes2;
5388 if (raw_malloc) {
5389 bytes2 = PyMem_RawRealloc(bytes, final_size);
5390 }
5391 else {
5392 bytes2 = PyMem_Realloc(bytes, final_size);
5393 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005394 if (bytes2 == NULL) {
5395 if (error_pos != NULL) {
5396 *error_pos = (size_t)-1;
5397 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005398 if (raw_malloc) {
5399 PyMem_RawFree(bytes);
5400 }
5401 else {
5402 PyMem_Free(bytes);
5403 }
5404 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005405 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005406 *str = bytes2;
5407 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005408}
5409
5410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005411/* Primary internal function which creates utf8 encoded bytes objects.
5412
5413 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005414 and allocate exactly as much space needed at the end. Else allocate the
5415 maximum possible needed (4 result bytes per Unicode character), and return
5416 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005417*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005418static PyObject *
5419unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5420 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005422 if (!PyUnicode_Check(unicode)) {
5423 PyErr_BadArgument();
5424 return NULL;
5425 }
5426
5427 if (PyUnicode_READY(unicode) == -1)
5428 return NULL;
5429
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005430 if (PyUnicode_UTF8(unicode))
5431 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5432 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005433
Inada Naoki02a4d572020-02-27 13:48:59 +09005434 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005435 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005436 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5437
5438 _PyBytesWriter writer;
5439 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005440
Benjamin Petersonead6b532011-12-20 17:23:42 -06005441 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005442 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005443 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005444 case PyUnicode_1BYTE_KIND:
5445 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5446 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005447 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5448 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005449 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005450 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5451 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005452 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005453 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5454 break;
Tim Peters602f7402002-04-27 18:03:26 +00005455 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005456
5457 if (end == NULL) {
5458 _PyBytesWriter_Dealloc(&writer);
5459 return NULL;
5460 }
5461 return _PyBytesWriter_Finish(&writer, end);
5462}
5463
5464static int
5465unicode_fill_utf8(PyObject *unicode)
5466{
5467 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5468 assert(!PyUnicode_IS_ASCII(unicode));
5469
5470 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005471 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005472 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5473
5474 _PyBytesWriter writer;
5475 char *end;
5476
5477 switch (kind) {
5478 default:
5479 Py_UNREACHABLE();
5480 case PyUnicode_1BYTE_KIND:
5481 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5482 _Py_ERROR_STRICT, NULL);
5483 break;
5484 case PyUnicode_2BYTE_KIND:
5485 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5486 _Py_ERROR_STRICT, NULL);
5487 break;
5488 case PyUnicode_4BYTE_KIND:
5489 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5490 _Py_ERROR_STRICT, NULL);
5491 break;
5492 }
5493 if (end == NULL) {
5494 _PyBytesWriter_Dealloc(&writer);
5495 return -1;
5496 }
5497
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005498 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005499 PyBytes_AS_STRING(writer.buffer);
5500 Py_ssize_t len = end - start;
5501
5502 char *cache = PyObject_MALLOC(len + 1);
5503 if (cache == NULL) {
5504 _PyBytesWriter_Dealloc(&writer);
5505 PyErr_NoMemory();
5506 return -1;
5507 }
5508 _PyUnicode_UTF8(unicode) = cache;
5509 _PyUnicode_UTF8_LENGTH(unicode) = len;
5510 memcpy(cache, start, len);
5511 cache[len] = '\0';
5512 _PyBytesWriter_Dealloc(&writer);
5513 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514}
5515
Alexander Belopolsky40018472011-02-26 01:02:56 +00005516PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005517_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5518{
5519 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5520}
5521
5522
5523PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005524PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5525 Py_ssize_t size,
5526 const char *errors)
5527{
5528 PyObject *v, *unicode;
5529
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005530 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005531 if (unicode == NULL)
5532 return NULL;
5533 v = _PyUnicode_AsUTF8String(unicode, errors);
5534 Py_DECREF(unicode);
5535 return v;
5536}
5537
5538PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005539PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005541 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542}
5543
Walter Dörwald41980ca2007-08-16 21:55:45 +00005544/* --- UTF-32 Codec ------------------------------------------------------- */
5545
5546PyObject *
5547PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 Py_ssize_t size,
5549 const char *errors,
5550 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005551{
5552 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5553}
5554
5555PyObject *
5556PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 Py_ssize_t size,
5558 const char *errors,
5559 int *byteorder,
5560 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005561{
5562 const char *starts = s;
5563 Py_ssize_t startinpos;
5564 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005565 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005566 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005567 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005568 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005569 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005570 PyObject *errorHandler = NULL;
5571 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005572
Andy Lestere6be9b52020-02-11 20:28:35 -06005573 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005574 e = q + size;
5575
5576 if (byteorder)
5577 bo = *byteorder;
5578
5579 /* Check for BOM marks (U+FEFF) in the input and adjust current
5580 byte order setting accordingly. In native mode, the leading BOM
5581 mark is skipped, in all other modes, it is copied to the output
5582 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005583 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005584 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005585 if (bom == 0x0000FEFF) {
5586 bo = -1;
5587 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005589 else if (bom == 0xFFFE0000) {
5590 bo = 1;
5591 q += 4;
5592 }
5593 if (byteorder)
5594 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005595 }
5596
Victor Stinnere64322e2012-10-30 23:12:47 +01005597 if (q == e) {
5598 if (consumed)
5599 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005600 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005601 }
5602
Victor Stinnere64322e2012-10-30 23:12:47 +01005603#ifdef WORDS_BIGENDIAN
5604 le = bo < 0;
5605#else
5606 le = bo <= 0;
5607#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005608 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005609
Victor Stinner8f674cc2013-04-17 23:02:17 +02005610 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005611 writer.min_length = (e - q + 3) / 4;
5612 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005613 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005614
Victor Stinnere64322e2012-10-30 23:12:47 +01005615 while (1) {
5616 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005617 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005618
Victor Stinnere64322e2012-10-30 23:12:47 +01005619 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005620 enum PyUnicode_Kind kind = writer.kind;
5621 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005622 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005624 if (le) {
5625 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005626 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005627 if (ch > maxch)
5628 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005629 if (kind != PyUnicode_1BYTE_KIND &&
5630 Py_UNICODE_IS_SURROGATE(ch))
5631 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005632 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005633 q += 4;
5634 } while (q <= last);
5635 }
5636 else {
5637 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005638 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005639 if (ch > maxch)
5640 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005641 if (kind != PyUnicode_1BYTE_KIND &&
5642 Py_UNICODE_IS_SURROGATE(ch))
5643 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005644 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005645 q += 4;
5646 } while (q <= last);
5647 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005648 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005649 }
5650
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005651 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005652 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005653 startinpos = ((const char *)q) - starts;
5654 endinpos = startinpos + 4;
5655 }
5656 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005657 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005659 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005661 startinpos = ((const char *)q) - starts;
5662 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005664 else {
5665 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005666 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005667 goto onError;
5668 q += 4;
5669 continue;
5670 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005671 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005672 startinpos = ((const char *)q) - starts;
5673 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005675
5676 /* The remaining input chars are ignored if the callback
5677 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005680 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005682 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005684 }
5685
Walter Dörwald41980ca2007-08-16 21:55:45 +00005686 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005688
Walter Dörwald41980ca2007-08-16 21:55:45 +00005689 Py_XDECREF(errorHandler);
5690 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005692
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005694 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005695 Py_XDECREF(errorHandler);
5696 Py_XDECREF(exc);
5697 return NULL;
5698}
5699
5700PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005701_PyUnicode_EncodeUTF32(PyObject *str,
5702 const char *errors,
5703 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005704{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005705 enum PyUnicode_Kind kind;
5706 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005707 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005708 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005709 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005710#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005711 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005712#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005713 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005714#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005715 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005716 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005717 PyObject *errorHandler = NULL;
5718 PyObject *exc = NULL;
5719 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005720
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005721 if (!PyUnicode_Check(str)) {
5722 PyErr_BadArgument();
5723 return NULL;
5724 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005725 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005726 return NULL;
5727 kind = PyUnicode_KIND(str);
5728 data = PyUnicode_DATA(str);
5729 len = PyUnicode_GET_LENGTH(str);
5730
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005731 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005732 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005733 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005734 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005735 if (v == NULL)
5736 return NULL;
5737
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005738 /* output buffer is 4-bytes aligned */
5739 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005740 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005741 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005742 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005743 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005744 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005745
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005746 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005747 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005748 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005749 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005750 else
5751 encoding = "utf-32";
5752
5753 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005754 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5755 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005756 }
5757
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005758 pos = 0;
5759 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005760 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005761
5762 if (kind == PyUnicode_2BYTE_KIND) {
5763 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5764 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005766 else {
5767 assert(kind == PyUnicode_4BYTE_KIND);
5768 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5769 &out, native_ordering);
5770 }
5771 if (pos == len)
5772 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005773
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005774 rep = unicode_encode_call_errorhandler(
5775 errors, &errorHandler,
5776 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005777 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005778 if (!rep)
5779 goto error;
5780
5781 if (PyBytes_Check(rep)) {
5782 repsize = PyBytes_GET_SIZE(rep);
5783 if (repsize & 3) {
5784 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005785 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005786 "surrogates not allowed");
5787 goto error;
5788 }
5789 moreunits = repsize / 4;
5790 }
5791 else {
5792 assert(PyUnicode_Check(rep));
5793 if (PyUnicode_READY(rep) < 0)
5794 goto error;
5795 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5796 if (!PyUnicode_IS_ASCII(rep)) {
5797 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005798 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005799 "surrogates not allowed");
5800 goto error;
5801 }
5802 }
5803
5804 /* four bytes are reserved for each surrogate */
5805 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005806 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005807 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005808 /* integer overflow */
5809 PyErr_NoMemory();
5810 goto error;
5811 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005812 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005813 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005814 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005815 }
5816
5817 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005818 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005819 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005820 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005821 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005822 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5823 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005824 }
5825
5826 Py_CLEAR(rep);
5827 }
5828
5829 /* Cut back to size actually needed. This is necessary for, for example,
5830 encoding of a string containing isolated surrogates and the 'ignore'
5831 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005832 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005833 if (nsize != PyBytes_GET_SIZE(v))
5834 _PyBytes_Resize(&v, nsize);
5835 Py_XDECREF(errorHandler);
5836 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005837 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005838 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 error:
5840 Py_XDECREF(rep);
5841 Py_XDECREF(errorHandler);
5842 Py_XDECREF(exc);
5843 Py_XDECREF(v);
5844 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005845}
5846
Alexander Belopolsky40018472011-02-26 01:02:56 +00005847PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005848PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5849 Py_ssize_t size,
5850 const char *errors,
5851 int byteorder)
5852{
5853 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005854 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005855 if (tmp == NULL)
5856 return NULL;
5857 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5858 Py_DECREF(tmp);
5859 return result;
5860}
5861
5862PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005863PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005864{
Victor Stinnerb960b342011-11-20 19:12:52 +01005865 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005866}
5867
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868/* --- UTF-16 Codec ------------------------------------------------------- */
5869
Tim Peters772747b2001-08-09 22:21:55 +00005870PyObject *
5871PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 Py_ssize_t size,
5873 const char *errors,
5874 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875{
Walter Dörwald69652032004-09-07 20:24:22 +00005876 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5877}
5878
5879PyObject *
5880PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 Py_ssize_t size,
5882 const char *errors,
5883 int *byteorder,
5884 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005885{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005886 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005887 Py_ssize_t startinpos;
5888 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005889 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005890 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005891 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005892 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005893 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894 PyObject *errorHandler = NULL;
5895 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005896 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897
Andy Lestere6be9b52020-02-11 20:28:35 -06005898 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005899 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900
5901 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005902 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005904 /* Check for BOM marks (U+FEFF) in the input and adjust current
5905 byte order setting accordingly. In native mode, the leading BOM
5906 mark is skipped, in all other modes, it is copied to the output
5907 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005908 if (bo == 0 && size >= 2) {
5909 const Py_UCS4 bom = (q[1] << 8) | q[0];
5910 if (bom == 0xFEFF) {
5911 q += 2;
5912 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005914 else if (bom == 0xFFFE) {
5915 q += 2;
5916 bo = 1;
5917 }
5918 if (byteorder)
5919 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921
Antoine Pitrou63065d72012-05-15 23:48:04 +02005922 if (q == e) {
5923 if (consumed)
5924 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005925 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005926 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005927
Christian Heimes743e0cd2012-10-17 23:52:17 +02005928#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005929 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005930 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005931#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005932 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005933 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005934#endif
Tim Peters772747b2001-08-09 22:21:55 +00005935
Antoine Pitrou63065d72012-05-15 23:48:04 +02005936 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005937 character count normally. Error handler will take care of
5938 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005939 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005940 writer.min_length = (e - q + 1) / 2;
5941 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005942 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005943
Antoine Pitrou63065d72012-05-15 23:48:04 +02005944 while (1) {
5945 Py_UCS4 ch = 0;
5946 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005947 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005948 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005949 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005950 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005951 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005952 native_ordering);
5953 else
5954 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005955 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005956 native_ordering);
5957 } else if (kind == PyUnicode_2BYTE_KIND) {
5958 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005959 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005960 native_ordering);
5961 } else {
5962 assert(kind == PyUnicode_4BYTE_KIND);
5963 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005964 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005965 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005966 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005967 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005968
Antoine Pitrou63065d72012-05-15 23:48:04 +02005969 switch (ch)
5970 {
5971 case 0:
5972 /* remaining byte at the end? (size should be even) */
5973 if (q == e || consumed)
5974 goto End;
5975 errmsg = "truncated data";
5976 startinpos = ((const char *)q) - starts;
5977 endinpos = ((const char *)e) - starts;
5978 break;
5979 /* The remaining input chars are ignored if the callback
5980 chooses to skip the input */
5981 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005982 q -= 2;
5983 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005984 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005985 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005986 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005987 endinpos = ((const char *)e) - starts;
5988 break;
5989 case 2:
5990 errmsg = "illegal encoding";
5991 startinpos = ((const char *)q) - 2 - starts;
5992 endinpos = startinpos + 2;
5993 break;
5994 case 3:
5995 errmsg = "illegal UTF-16 surrogate";
5996 startinpos = ((const char *)q) - 4 - starts;
5997 endinpos = startinpos + 2;
5998 break;
5999 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006000 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006001 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 continue;
6003 }
6004
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006005 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006006 errors,
6007 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006008 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006009 &starts,
6010 (const char **)&e,
6011 &startinpos,
6012 &endinpos,
6013 &exc,
6014 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006015 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 }
6018
Antoine Pitrou63065d72012-05-15 23:48:04 +02006019End:
Walter Dörwald69652032004-09-07 20:24:22 +00006020 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006022
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 Py_XDECREF(errorHandler);
6024 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006025 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006028 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 Py_XDECREF(errorHandler);
6030 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 return NULL;
6032}
6033
Tim Peters772747b2001-08-09 22:21:55 +00006034PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006035_PyUnicode_EncodeUTF16(PyObject *str,
6036 const char *errors,
6037 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006039 enum PyUnicode_Kind kind;
6040 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006041 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006042 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006043 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006044 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006045#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006046 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006047#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006048 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006049#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006050 const char *encoding;
6051 Py_ssize_t nsize, pos;
6052 PyObject *errorHandler = NULL;
6053 PyObject *exc = NULL;
6054 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006055
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006056 if (!PyUnicode_Check(str)) {
6057 PyErr_BadArgument();
6058 return NULL;
6059 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006060 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006061 return NULL;
6062 kind = PyUnicode_KIND(str);
6063 data = PyUnicode_DATA(str);
6064 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006065
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006066 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006067 if (kind == PyUnicode_4BYTE_KIND) {
6068 const Py_UCS4 *in = (const Py_UCS4 *)data;
6069 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006070 while (in < end) {
6071 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006072 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006073 }
6074 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006075 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006076 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006078 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006079 nsize = len + pairs + (byteorder == 0);
6080 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006081 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006085 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006086 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006087 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006088 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006089 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006090 }
6091 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006092 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006093 }
Tim Peters772747b2001-08-09 22:21:55 +00006094
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006095 if (kind == PyUnicode_1BYTE_KIND) {
6096 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6097 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006098 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006099
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006100 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006101 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006102 }
6103 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006104 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006105 }
6106 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006107 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006108 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006109
6110 pos = 0;
6111 while (pos < len) {
6112 Py_ssize_t repsize, moreunits;
6113
6114 if (kind == PyUnicode_2BYTE_KIND) {
6115 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6116 &out, native_ordering);
6117 }
6118 else {
6119 assert(kind == PyUnicode_4BYTE_KIND);
6120 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6121 &out, native_ordering);
6122 }
6123 if (pos == len)
6124 break;
6125
6126 rep = unicode_encode_call_errorhandler(
6127 errors, &errorHandler,
6128 encoding, "surrogates not allowed",
6129 str, &exc, pos, pos + 1, &pos);
6130 if (!rep)
6131 goto error;
6132
6133 if (PyBytes_Check(rep)) {
6134 repsize = PyBytes_GET_SIZE(rep);
6135 if (repsize & 1) {
6136 raise_encode_exception(&exc, encoding,
6137 str, pos - 1, pos,
6138 "surrogates not allowed");
6139 goto error;
6140 }
6141 moreunits = repsize / 2;
6142 }
6143 else {
6144 assert(PyUnicode_Check(rep));
6145 if (PyUnicode_READY(rep) < 0)
6146 goto error;
6147 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6148 if (!PyUnicode_IS_ASCII(rep)) {
6149 raise_encode_exception(&exc, encoding,
6150 str, pos - 1, pos,
6151 "surrogates not allowed");
6152 goto error;
6153 }
6154 }
6155
6156 /* two bytes are reserved for each surrogate */
6157 if (moreunits > 1) {
6158 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006159 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006160 /* integer overflow */
6161 PyErr_NoMemory();
6162 goto error;
6163 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006164 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006165 goto error;
6166 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6167 }
6168
6169 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006170 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006171 out += moreunits;
6172 } else /* rep is unicode */ {
6173 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6174 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6175 &out, native_ordering);
6176 }
6177
6178 Py_CLEAR(rep);
6179 }
6180
6181 /* Cut back to size actually needed. This is necessary for, for example,
6182 encoding of a string containing isolated surrogates and the 'ignore' handler
6183 is used. */
6184 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6185 if (nsize != PyBytes_GET_SIZE(v))
6186 _PyBytes_Resize(&v, nsize);
6187 Py_XDECREF(errorHandler);
6188 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006189 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006190 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006191 error:
6192 Py_XDECREF(rep);
6193 Py_XDECREF(errorHandler);
6194 Py_XDECREF(exc);
6195 Py_XDECREF(v);
6196 return NULL;
6197#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198}
6199
Alexander Belopolsky40018472011-02-26 01:02:56 +00006200PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6202 Py_ssize_t size,
6203 const char *errors,
6204 int byteorder)
6205{
6206 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006207 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006208 if (tmp == NULL)
6209 return NULL;
6210 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6211 Py_DECREF(tmp);
6212 return result;
6213}
6214
6215PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006216PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006218 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219}
6220
6221/* --- Unicode Escape Codec ----------------------------------------------- */
6222
Fredrik Lundh06d12682001-01-24 07:59:11 +00006223static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006224
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006226_PyUnicode_DecodeUnicodeEscape(const char *s,
6227 Py_ssize_t size,
6228 const char *errors,
6229 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006231 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006232 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006234 PyObject *errorHandler = NULL;
6235 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006236
Eric V. Smith42454af2016-10-31 09:22:08 -04006237 // so we can remember if we've seen an invalid escape char or not
6238 *first_invalid_escape = NULL;
6239
Victor Stinner62ec3312016-09-06 17:04:34 -07006240 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006241 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 }
6243 /* Escaped strings will always be longer than the resulting
6244 Unicode string, so we start with size here and then reduce the
6245 length after conversion to the true value.
6246 (but if the error callback returns a long replacement string
6247 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006248 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 writer.min_length = size;
6250 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6251 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006252 }
6253
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 end = s + size;
6255 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 unsigned char c = (unsigned char) *s++;
6257 Py_UCS4 ch;
6258 int count;
6259 Py_ssize_t startinpos;
6260 Py_ssize_t endinpos;
6261 const char *message;
6262
6263#define WRITE_ASCII_CHAR(ch) \
6264 do { \
6265 assert(ch <= 127); \
6266 assert(writer.pos < writer.size); \
6267 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6268 } while(0)
6269
6270#define WRITE_CHAR(ch) \
6271 do { \
6272 if (ch <= writer.maxchar) { \
6273 assert(writer.pos < writer.size); \
6274 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6275 } \
6276 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6277 goto onError; \
6278 } \
6279 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280
6281 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 if (c != '\\') {
6283 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 continue;
6285 }
6286
Victor Stinner62ec3312016-09-06 17:04:34 -07006287 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006289 if (s >= end) {
6290 message = "\\ at end of string";
6291 goto error;
6292 }
6293 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006294
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006296 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006299 case '\n': continue;
6300 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6301 case '\'': WRITE_ASCII_CHAR('\''); continue;
6302 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6303 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006304 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6306 case 't': WRITE_ASCII_CHAR('\t'); continue;
6307 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6308 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006309 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006310 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006311 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006312 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 case '0': case '1': case '2': case '3':
6316 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006318 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 ch = (ch<<3) + *s++ - '0';
6320 if (s < end && '0' <= *s && *s <= '7') {
6321 ch = (ch<<3) + *s++ - '0';
6322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006324 WRITE_CHAR(ch);
6325 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 /* hex escapes */
6328 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006331 message = "truncated \\xXX escape";
6332 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006337 message = "truncated \\uXXXX escape";
6338 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006341 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006342 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006343 message = "truncated \\UXXXXXXXX escape";
6344 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006346 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 ch <<= 4;
6348 if (c >= '0' && c <= '9') {
6349 ch += c - '0';
6350 }
6351 else if (c >= 'a' && c <= 'f') {
6352 ch += c - ('a' - 10);
6353 }
6354 else if (c >= 'A' && c <= 'F') {
6355 ch += c - ('A' - 10);
6356 }
6357 else {
6358 break;
6359 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006360 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006361 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006362 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006363 }
6364
6365 /* when we get here, ch is a 32-bit unicode character */
6366 if (ch > MAX_UNICODE) {
6367 message = "illegal Unicode character";
6368 goto error;
6369 }
6370
6371 WRITE_CHAR(ch);
6372 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006373
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006375 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006376 if (ucnhash_CAPI == NULL) {
6377 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006378 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6379 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006380 if (ucnhash_CAPI == NULL) {
6381 PyErr_SetString(
6382 PyExc_UnicodeError,
6383 "\\N escapes not supported (can't load unicodedata module)"
6384 );
6385 goto onError;
6386 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006387 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006388
6389 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006390 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006391 const char *start = ++s;
6392 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006393 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006394 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006395 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006396 namelen = s - start;
6397 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006398 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006399 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 ch = 0xffffffff; /* in case 'getcode' messes up */
6401 if (namelen <= INT_MAX &&
6402 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6403 &ch, 0)) {
6404 assert(ch <= MAX_UNICODE);
6405 WRITE_CHAR(ch);
6406 continue;
6407 }
6408 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006409 }
6410 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006411 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006412
6413 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006414 if (*first_invalid_escape == NULL) {
6415 *first_invalid_escape = s-1; /* Back up one char, since we've
6416 already incremented s. */
6417 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 WRITE_ASCII_CHAR('\\');
6419 WRITE_CHAR(c);
6420 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006422
6423 error:
6424 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006426 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006427 errors, &errorHandler,
6428 "unicodeescape", message,
6429 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006431 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006432 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006433 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006434
6435#undef WRITE_ASCII_CHAR
6436#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006438
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006439 Py_XDECREF(errorHandler);
6440 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006441 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006442
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006444 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445 Py_XDECREF(errorHandler);
6446 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 return NULL;
6448}
6449
Eric V. Smith42454af2016-10-31 09:22:08 -04006450PyObject *
6451PyUnicode_DecodeUnicodeEscape(const char *s,
6452 Py_ssize_t size,
6453 const char *errors)
6454{
6455 const char *first_invalid_escape;
6456 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6457 &first_invalid_escape);
6458 if (result == NULL)
6459 return NULL;
6460 if (first_invalid_escape != NULL) {
6461 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6462 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006463 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006464 Py_DECREF(result);
6465 return NULL;
6466 }
6467 }
6468 return result;
6469}
6470
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006471/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472
Alexander Belopolsky40018472011-02-26 01:02:56 +00006473PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006474PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006476 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006477 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006479 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006480 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006481 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482
Ezio Melottie7f90372012-10-05 03:33:31 +03006483 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006484 escape.
6485
Ezio Melottie7f90372012-10-05 03:33:31 +03006486 For UCS1 strings it's '\xxx', 4 bytes per source character.
6487 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6488 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006489 */
6490
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491 if (!PyUnicode_Check(unicode)) {
6492 PyErr_BadArgument();
6493 return NULL;
6494 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006495 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006496 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006497 }
Victor Stinner358af132015-10-12 22:36:57 +02006498
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006499 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006500 if (len == 0) {
6501 return PyBytes_FromStringAndSize(NULL, 0);
6502 }
6503
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006504 kind = PyUnicode_KIND(unicode);
6505 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006506 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6507 bytes, and 1 byte characters 4. */
6508 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006509 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 return PyErr_NoMemory();
6511 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006512 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006513 if (repr == NULL) {
6514 return NULL;
6515 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006516
Victor Stinner62ec3312016-09-06 17:04:34 -07006517 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006518 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006519 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006520
Victor Stinner62ec3312016-09-06 17:04:34 -07006521 /* U+0000-U+00ff range */
6522 if (ch < 0x100) {
6523 if (ch >= ' ' && ch < 127) {
6524 if (ch != '\\') {
6525 /* Copy printable US ASCII as-is */
6526 *p++ = (char) ch;
6527 }
6528 /* Escape backslashes */
6529 else {
6530 *p++ = '\\';
6531 *p++ = '\\';
6532 }
6533 }
Victor Stinner358af132015-10-12 22:36:57 +02006534
Victor Stinner62ec3312016-09-06 17:04:34 -07006535 /* Map special whitespace to '\t', \n', '\r' */
6536 else if (ch == '\t') {
6537 *p++ = '\\';
6538 *p++ = 't';
6539 }
6540 else if (ch == '\n') {
6541 *p++ = '\\';
6542 *p++ = 'n';
6543 }
6544 else if (ch == '\r') {
6545 *p++ = '\\';
6546 *p++ = 'r';
6547 }
6548
6549 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6550 else {
6551 *p++ = '\\';
6552 *p++ = 'x';
6553 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6554 *p++ = Py_hexdigits[ch & 0x000F];
6555 }
Tim Petersced69f82003-09-16 20:30:58 +00006556 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006557 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006558 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 *p++ = '\\';
6560 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006561 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6562 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6563 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6564 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006566 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6567 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006568
Victor Stinner62ec3312016-09-06 17:04:34 -07006569 /* Make sure that the first two digits are zero */
6570 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006571 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006572 *p++ = 'U';
6573 *p++ = '0';
6574 *p++ = '0';
6575 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6576 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6577 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6578 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6579 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6580 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583
Victor Stinner62ec3312016-09-06 17:04:34 -07006584 assert(p - PyBytes_AS_STRING(repr) > 0);
6585 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6586 return NULL;
6587 }
6588 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589}
6590
Alexander Belopolsky40018472011-02-26 01:02:56 +00006591PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006592PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6593 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006595 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006596 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006597 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006599 }
6600
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006601 result = PyUnicode_AsUnicodeEscapeString(tmp);
6602 Py_DECREF(tmp);
6603 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604}
6605
6606/* --- Raw Unicode Escape Codec ------------------------------------------- */
6607
Alexander Belopolsky40018472011-02-26 01:02:56 +00006608PyObject *
6609PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006610 Py_ssize_t size,
6611 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006613 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006614 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616 PyObject *errorHandler = NULL;
6617 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006618
Victor Stinner62ec3312016-09-06 17:04:34 -07006619 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006620 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006621 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006622
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 /* Escaped strings will always be longer than the resulting
6624 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625 length after conversion to the true value. (But decoding error
6626 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006627 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006628 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006629 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6630 goto onError;
6631 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006632
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 end = s + size;
6634 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006635 unsigned char c = (unsigned char) *s++;
6636 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006637 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006638 Py_ssize_t startinpos;
6639 Py_ssize_t endinpos;
6640 const char *message;
6641
6642#define WRITE_CHAR(ch) \
6643 do { \
6644 if (ch <= writer.maxchar) { \
6645 assert(writer.pos < writer.size); \
6646 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6647 } \
6648 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6649 goto onError; \
6650 } \
6651 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006654 if (c != '\\' || s >= end) {
6655 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006657 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006658
Victor Stinner62ec3312016-09-06 17:04:34 -07006659 c = (unsigned char) *s++;
6660 if (c == 'u') {
6661 count = 4;
6662 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006664 else if (c == 'U') {
6665 count = 8;
6666 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006667 }
6668 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006669 assert(writer.pos < writer.size);
6670 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6671 WRITE_CHAR(c);
6672 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006673 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006674 startinpos = s - starts - 2;
6675
6676 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6677 for (ch = 0; count && s < end; ++s, --count) {
6678 c = (unsigned char)*s;
6679 ch <<= 4;
6680 if (c >= '0' && c <= '9') {
6681 ch += c - '0';
6682 }
6683 else if (c >= 'a' && c <= 'f') {
6684 ch += c - ('a' - 10);
6685 }
6686 else if (c >= 'A' && c <= 'F') {
6687 ch += c - ('A' - 10);
6688 }
6689 else {
6690 break;
6691 }
6692 }
6693 if (!count) {
6694 if (ch <= MAX_UNICODE) {
6695 WRITE_CHAR(ch);
6696 continue;
6697 }
6698 message = "\\Uxxxxxxxx out of range";
6699 }
6700
6701 endinpos = s-starts;
6702 writer.min_length = end - s + writer.pos;
6703 if (unicode_decode_call_errorhandler_writer(
6704 errors, &errorHandler,
6705 "rawunicodeescape", message,
6706 &starts, &end, &startinpos, &endinpos, &exc, &s,
6707 &writer)) {
6708 goto onError;
6709 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006710 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006711
6712#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 Py_XDECREF(errorHandler);
6715 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006717
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006719 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006720 Py_XDECREF(errorHandler);
6721 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006723
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724}
6725
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006726
Alexander Belopolsky40018472011-02-26 01:02:56 +00006727PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006728PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Victor Stinner62ec3312016-09-06 17:04:34 -07006730 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006732 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006733 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006734 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006735 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006737 if (!PyUnicode_Check(unicode)) {
6738 PyErr_BadArgument();
6739 return NULL;
6740 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006741 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006742 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006743 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006744 kind = PyUnicode_KIND(unicode);
6745 data = PyUnicode_DATA(unicode);
6746 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006747 if (kind == PyUnicode_1BYTE_KIND) {
6748 return PyBytes_FromStringAndSize(data, len);
6749 }
Victor Stinner0e368262011-11-10 20:12:49 +01006750
Victor Stinner62ec3312016-09-06 17:04:34 -07006751 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6752 bytes, and 1 byte characters 4. */
6753 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006754
Victor Stinner62ec3312016-09-06 17:04:34 -07006755 if (len > PY_SSIZE_T_MAX / expandsize) {
6756 return PyErr_NoMemory();
6757 }
6758 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6759 if (repr == NULL) {
6760 return NULL;
6761 }
6762 if (len == 0) {
6763 return repr;
6764 }
6765
6766 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006767 for (pos = 0; pos < len; pos++) {
6768 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006769
Victor Stinner62ec3312016-09-06 17:04:34 -07006770 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6771 if (ch < 0x100) {
6772 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006773 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006774 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006775 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 *p++ = '\\';
6777 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006778 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6779 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6780 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6781 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006783 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6784 else {
6785 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6786 *p++ = '\\';
6787 *p++ = 'U';
6788 *p++ = '0';
6789 *p++ = '0';
6790 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6791 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6792 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6793 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6794 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6795 *p++ = Py_hexdigits[ch & 15];
6796 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006798
Victor Stinner62ec3312016-09-06 17:04:34 -07006799 assert(p > PyBytes_AS_STRING(repr));
6800 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6801 return NULL;
6802 }
6803 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804}
6805
Alexander Belopolsky40018472011-02-26 01:02:56 +00006806PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006807PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6808 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006810 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006811 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006812 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006813 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006814 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6815 Py_DECREF(tmp);
6816 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817}
6818
6819/* --- Latin-1 Codec ------------------------------------------------------ */
6820
Alexander Belopolsky40018472011-02-26 01:02:56 +00006821PyObject *
6822PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006823 Py_ssize_t size,
6824 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006827 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828}
6829
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006830/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006831static void
6832make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006833 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006834 PyObject *unicode,
6835 Py_ssize_t startpos, Py_ssize_t endpos,
6836 const char *reason)
6837{
6838 if (*exceptionObject == NULL) {
6839 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006840 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006841 encoding, unicode, startpos, endpos, reason);
6842 }
6843 else {
6844 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6845 goto onError;
6846 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6847 goto onError;
6848 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6849 goto onError;
6850 return;
6851 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006852 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006853 }
6854}
6855
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006856/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006857static void
6858raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006859 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006860 PyObject *unicode,
6861 Py_ssize_t startpos, Py_ssize_t endpos,
6862 const char *reason)
6863{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006864 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006865 encoding, unicode, startpos, endpos, reason);
6866 if (*exceptionObject != NULL)
6867 PyCodec_StrictErrors(*exceptionObject);
6868}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006869
6870/* error handling callback helper:
6871 build arguments, call the callback and check the arguments,
6872 put the result into newpos and return the replacement string, which
6873 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006874static PyObject *
6875unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006876 PyObject **errorHandler,
6877 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006878 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006879 Py_ssize_t startpos, Py_ssize_t endpos,
6880 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006882 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006883 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006884 PyObject *restuple;
6885 PyObject *resunicode;
6886
6887 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006889 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006891 }
6892
Benjamin Petersonbac79492012-01-14 13:34:47 -05006893 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006894 return NULL;
6895 len = PyUnicode_GET_LENGTH(unicode);
6896
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006897 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006898 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901
Petr Viktorinffd97532020-02-11 17:46:57 +01006902 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006906 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 Py_DECREF(restuple);
6908 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006909 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006910 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 &resunicode, newpos)) {
6912 Py_DECREF(restuple);
6913 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006915 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6916 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6917 Py_DECREF(restuple);
6918 return NULL;
6919 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006921 *newpos = len + *newpos;
6922 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006923 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 Py_DECREF(restuple);
6925 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006926 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006927 Py_INCREF(resunicode);
6928 Py_DECREF(restuple);
6929 return resunicode;
6930}
6931
Alexander Belopolsky40018472011-02-26 01:02:56 +00006932static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006933unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006934 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006935 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006937 /* input state */
6938 Py_ssize_t pos=0, size;
6939 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006940 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 /* pointer into the output */
6942 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006943 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6944 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006945 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006947 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006948 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006949 /* output object */
6950 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006951
Benjamin Petersonbac79492012-01-14 13:34:47 -05006952 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006953 return NULL;
6954 size = PyUnicode_GET_LENGTH(unicode);
6955 kind = PyUnicode_KIND(unicode);
6956 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006957 /* allocate enough for a simple encoding without
6958 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006959 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006960 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006961
6962 _PyBytesWriter_Init(&writer);
6963 str = _PyBytesWriter_Alloc(&writer, size);
6964 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006965 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006966
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006967 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006968 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006969
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006971 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006973 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006974 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006975 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006977 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006979 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006980 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006982
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006983 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006985
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006986 /* Only overallocate the buffer if it's not the last write */
6987 writer.overallocate = (collend < size);
6988
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006990 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006991 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006992
6993 switch (error_handler) {
6994 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006995 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006996 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006997
6998 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006999 memset(str, '?', collend - collstart);
7000 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007001 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007002 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007003 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 break;
Victor Stinner50149202015-09-22 00:26:54 +02007005
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007006 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007007 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007008 writer.min_size -= (collend - collstart);
7009 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007010 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007011 if (str == NULL)
7012 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007013 pos = collend;
7014 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007015
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007016 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007017 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007018 writer.min_size -= (collend - collstart);
7019 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007020 unicode, collstart, collend);
7021 if (str == NULL)
7022 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007023 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 break;
Victor Stinner50149202015-09-22 00:26:54 +02007025
Victor Stinnerc3713e92015-09-29 12:32:13 +02007026 case _Py_ERROR_SURROGATEESCAPE:
7027 for (i = collstart; i < collend; ++i) {
7028 ch = PyUnicode_READ(kind, data, i);
7029 if (ch < 0xdc80 || 0xdcff < ch) {
7030 /* Not a UTF-8b surrogate */
7031 break;
7032 }
7033 *str++ = (char)(ch - 0xdc00);
7034 ++pos;
7035 }
7036 if (i >= collend)
7037 break;
7038 collstart = pos;
7039 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007040 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007041
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007043 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7044 encoding, reason, unicode, &exc,
7045 collstart, collend, &newpos);
7046 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007048
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007049 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007050 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007051
Victor Stinner6bd525b2015-10-09 13:10:05 +02007052 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007053 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007054 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007055 PyBytes_AS_STRING(rep),
7056 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007057 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007058 else {
7059 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007060
Victor Stinner6bd525b2015-10-09 13:10:05 +02007061 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007063
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007064 if (limit == 256 ?
7065 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7066 !PyUnicode_IS_ASCII(rep))
7067 {
7068 /* Not all characters are smaller than limit */
7069 raise_encode_exception(&exc, encoding, unicode,
7070 collstart, collend, reason);
7071 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007073 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7074 str = _PyBytesWriter_WriteBytes(&writer, str,
7075 PyUnicode_DATA(rep),
7076 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007078 if (str == NULL)
7079 goto onError;
7080
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007081 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007082 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007083 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007084
7085 /* If overallocation was disabled, ensure that it was the last
7086 write. Otherwise, we missed an optimization */
7087 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007088 }
7089 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007090
Victor Stinner50149202015-09-22 00:26:54 +02007091 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007092 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007093 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007094
7095 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007096 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007097 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007098 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007099 Py_XDECREF(exc);
7100 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007101}
7102
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007103/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007104PyObject *
7105PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007106 Py_ssize_t size,
7107 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007109 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007110 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007111 if (unicode == NULL)
7112 return NULL;
7113 result = unicode_encode_ucs1(unicode, errors, 256);
7114 Py_DECREF(unicode);
7115 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116}
7117
Alexander Belopolsky40018472011-02-26 01:02:56 +00007118PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007119_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120{
7121 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 PyErr_BadArgument();
7123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007125 if (PyUnicode_READY(unicode) == -1)
7126 return NULL;
7127 /* Fast path: if it is a one-byte string, construct
7128 bytes object directly. */
7129 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7130 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7131 PyUnicode_GET_LENGTH(unicode));
7132 /* Non-Latin-1 characters present. Defer to above function to
7133 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007134 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007135}
7136
7137PyObject*
7138PyUnicode_AsLatin1String(PyObject *unicode)
7139{
7140 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141}
7142
7143/* --- 7-bit ASCII Codec -------------------------------------------------- */
7144
Alexander Belopolsky40018472011-02-26 01:02:56 +00007145PyObject *
7146PyUnicode_DecodeASCII(const char *s,
7147 Py_ssize_t size,
7148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007150 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007151 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007152 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007153 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007154 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007155
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007157 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007158
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007160 if (size == 1 && (unsigned char)s[0] < 128)
7161 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007162
Inada Naoki770847a2019-06-24 12:30:24 +09007163 // Shortcut for simple case
7164 PyObject *u = PyUnicode_New(size, 127);
7165 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007166 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007167 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007168 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007169 if (outpos == size) {
7170 return u;
7171 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007172
Inada Naoki770847a2019-06-24 12:30:24 +09007173 _PyUnicodeWriter writer;
7174 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007175 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007176
Inada Naoki770847a2019-06-24 12:30:24 +09007177 s += outpos;
7178 int kind = writer.kind;
7179 void *data = writer.data;
7180 Py_ssize_t startinpos, endinpos;
7181
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007182 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007183 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007185 PyUnicode_WRITE(kind, data, writer.pos, c);
7186 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007188 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007190
7191 /* byte outsize range 0x00..0x7f: call the error handler */
7192
7193 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007194 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007195
7196 switch (error_handler)
7197 {
7198 case _Py_ERROR_REPLACE:
7199 case _Py_ERROR_SURROGATEESCAPE:
7200 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007201 but we may switch to UCS2 at the first write */
7202 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7203 goto onError;
7204 kind = writer.kind;
7205 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007206
7207 if (error_handler == _Py_ERROR_REPLACE)
7208 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7209 else
7210 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7211 writer.pos++;
7212 ++s;
7213 break;
7214
7215 case _Py_ERROR_IGNORE:
7216 ++s;
7217 break;
7218
7219 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 startinpos = s-starts;
7221 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007222 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007223 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 "ascii", "ordinal not in range(128)",
7225 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007226 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007228 kind = writer.kind;
7229 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007232 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007233 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007234 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007235
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007237 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007238 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007239 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 return NULL;
7241}
7242
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007243/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007244PyObject *
7245PyUnicode_EncodeASCII(const Py_UNICODE *p,
7246 Py_ssize_t size,
7247 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007249 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007250 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007251 if (unicode == NULL)
7252 return NULL;
7253 result = unicode_encode_ucs1(unicode, errors, 128);
7254 Py_DECREF(unicode);
7255 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256}
7257
Alexander Belopolsky40018472011-02-26 01:02:56 +00007258PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007259_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260{
7261 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 PyErr_BadArgument();
7263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007265 if (PyUnicode_READY(unicode) == -1)
7266 return NULL;
7267 /* Fast path: if it is an ASCII-only string, construct bytes object
7268 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007269 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007270 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7271 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007272 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007273}
7274
7275PyObject *
7276PyUnicode_AsASCIIString(PyObject *unicode)
7277{
7278 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279}
7280
Steve Dowercc16be82016-09-08 10:35:16 -07007281#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007282
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007283/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007284
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007285#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007286#define NEED_RETRY
7287#endif
7288
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7293#define DECODING_CHUNK_SIZE (INT_MAX/4)
7294
Victor Stinner3a50e702011-10-18 21:21:00 +02007295#ifndef WC_ERR_INVALID_CHARS
7296# define WC_ERR_INVALID_CHARS 0x0080
7297#endif
7298
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007299static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007300code_page_name(UINT code_page, PyObject **obj)
7301{
7302 *obj = NULL;
7303 if (code_page == CP_ACP)
7304 return "mbcs";
7305 if (code_page == CP_UTF7)
7306 return "CP_UTF7";
7307 if (code_page == CP_UTF8)
7308 return "CP_UTF8";
7309
7310 *obj = PyBytes_FromFormat("cp%u", code_page);
7311 if (*obj == NULL)
7312 return NULL;
7313 return PyBytes_AS_STRING(*obj);
7314}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315
Victor Stinner3a50e702011-10-18 21:21:00 +02007316static DWORD
7317decode_code_page_flags(UINT code_page)
7318{
7319 if (code_page == CP_UTF7) {
7320 /* The CP_UTF7 decoder only supports flags=0 */
7321 return 0;
7322 }
7323 else
7324 return MB_ERR_INVALID_CHARS;
7325}
7326
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007327/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 * Decode a byte string from a Windows code page into unicode object in strict
7329 * mode.
7330 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007331 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7332 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007334static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007335decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007336 wchar_t **buf,
7337 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 const char *in,
7339 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007340{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007341 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007342 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007344
7345 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007346 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007347 while ((outsize = MultiByteToWideChar(code_page, flags,
7348 in, insize, NULL, 0)) <= 0)
7349 {
7350 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7351 goto error;
7352 }
7353 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7354 flags = 0;
7355 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007356
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007357 /* Extend a wchar_t* buffer */
7358 Py_ssize_t n = *bufsize; /* Get the current length */
7359 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7360 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007361 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007362 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363
7364 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7366 if (outsize <= 0)
7367 goto error;
7368 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007369
Victor Stinner3a50e702011-10-18 21:21:00 +02007370error:
7371 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7372 return -2;
7373 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007374 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375}
7376
Victor Stinner3a50e702011-10-18 21:21:00 +02007377/*
7378 * Decode a byte string from a code page into unicode object with an error
7379 * handler.
7380 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007381 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 * UnicodeDecodeError exception and returns -1 on error.
7383 */
7384static int
7385decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007386 wchar_t **buf,
7387 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007388 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007389 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007390{
7391 const char *startin = in;
7392 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007393 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 /* Ideally, we should get reason from FormatMessage. This is the Windows
7395 2000 English version of the message. */
7396 const char *reason = "No mapping for the Unicode character exists "
7397 "in the target code page.";
7398 /* each step cannot decode more than 1 character, but a character can be
7399 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007400 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007401 int insize;
7402 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 PyObject *errorHandler = NULL;
7404 PyObject *exc = NULL;
7405 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007406 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007407 DWORD err;
7408 int ret = -1;
7409
7410 assert(size > 0);
7411
7412 encoding = code_page_name(code_page, &encoding_obj);
7413 if (encoding == NULL)
7414 return -1;
7415
Victor Stinner7d00cc12014-03-17 23:08:06 +01007416 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7418 UnicodeDecodeError. */
7419 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7420 if (exc != NULL) {
7421 PyCodec_StrictErrors(exc);
7422 Py_CLEAR(exc);
7423 }
7424 goto error;
7425 }
7426
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007427 /* Extend a wchar_t* buffer */
7428 Py_ssize_t n = *bufsize; /* Get the current length */
7429 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7430 PyErr_NoMemory();
7431 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007433 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7434 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007435 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007436 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007437
7438 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 while (in < endin)
7440 {
7441 /* Decode a character */
7442 insize = 1;
7443 do
7444 {
7445 outsize = MultiByteToWideChar(code_page, flags,
7446 in, insize,
7447 buffer, Py_ARRAY_LENGTH(buffer));
7448 if (outsize > 0)
7449 break;
7450 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007451 if (err == ERROR_INVALID_FLAGS && flags) {
7452 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7453 flags = 0;
7454 continue;
7455 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 if (err != ERROR_NO_UNICODE_TRANSLATION
7457 && err != ERROR_INSUFFICIENT_BUFFER)
7458 {
7459 PyErr_SetFromWindowsErr(0);
7460 goto error;
7461 }
7462 insize++;
7463 }
7464 /* 4=maximum length of a UTF-8 sequence */
7465 while (insize <= 4 && (in + insize) <= endin);
7466
7467 if (outsize <= 0) {
7468 Py_ssize_t startinpos, endinpos, outpos;
7469
Victor Stinner7d00cc12014-03-17 23:08:06 +01007470 /* last character in partial decode? */
7471 if (in + insize >= endin && !final)
7472 break;
7473
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 startinpos = in - startin;
7475 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007476 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007477 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 errors, &errorHandler,
7479 encoding, reason,
7480 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007481 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007482 {
7483 goto error;
7484 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007485 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 }
7487 else {
7488 in += insize;
7489 memcpy(out, buffer, outsize * sizeof(wchar_t));
7490 out += outsize;
7491 }
7492 }
7493
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007494 /* Shrink the buffer */
7495 assert(out - *buf <= *bufsize);
7496 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007497 /* (in - startin) <= size and size is an int */
7498 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007499
7500error:
7501 Py_XDECREF(encoding_obj);
7502 Py_XDECREF(errorHandler);
7503 Py_XDECREF(exc);
7504 return ret;
7505}
7506
Victor Stinner3a50e702011-10-18 21:21:00 +02007507static PyObject *
7508decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007509 const char *s, Py_ssize_t size,
7510 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007511{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007512 wchar_t *buf = NULL;
7513 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007514 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007515
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 if (code_page < 0) {
7517 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7518 return NULL;
7519 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007520 if (size < 0) {
7521 PyErr_BadInternalCall();
7522 return NULL;
7523 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007524
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007525 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007527
Victor Stinner76a31a62011-11-04 00:05:13 +01007528 do
7529 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007530#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007531 if (size > DECODING_CHUNK_SIZE) {
7532 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007533 final = 0;
7534 done = 0;
7535 }
7536 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007537#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007538 {
7539 chunk_size = (int)size;
7540 final = (consumed == NULL);
7541 done = 1;
7542 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007543
Victor Stinner76a31a62011-11-04 00:05:13 +01007544 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007545 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007546 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007547 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007548 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007549
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007550 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007551 s, chunk_size);
7552 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007553 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007554 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007555 errors, final);
7556 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007557
7558 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007559 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007560 return NULL;
7561 }
7562
7563 if (consumed)
7564 *consumed += converted;
7565
7566 s += converted;
7567 size -= converted;
7568 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007569
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007570 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7571 PyMem_Free(buf);
7572 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007573}
7574
Alexander Belopolsky40018472011-02-26 01:02:56 +00007575PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007576PyUnicode_DecodeCodePageStateful(int code_page,
7577 const char *s,
7578 Py_ssize_t size,
7579 const char *errors,
7580 Py_ssize_t *consumed)
7581{
7582 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7583}
7584
7585PyObject *
7586PyUnicode_DecodeMBCSStateful(const char *s,
7587 Py_ssize_t size,
7588 const char *errors,
7589 Py_ssize_t *consumed)
7590{
7591 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7592}
7593
7594PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007595PyUnicode_DecodeMBCS(const char *s,
7596 Py_ssize_t size,
7597 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007598{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007599 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7600}
7601
Victor Stinner3a50e702011-10-18 21:21:00 +02007602static DWORD
7603encode_code_page_flags(UINT code_page, const char *errors)
7604{
7605 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007606 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 }
7608 else if (code_page == CP_UTF7) {
7609 /* CP_UTF7 only supports flags=0 */
7610 return 0;
7611 }
7612 else {
7613 if (errors != NULL && strcmp(errors, "replace") == 0)
7614 return 0;
7615 else
7616 return WC_NO_BEST_FIT_CHARS;
7617 }
7618}
7619
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007620/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 * Encode a Unicode string to a Windows code page into a byte string in strict
7622 * mode.
7623 *
7624 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007625 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007626 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007627static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007628encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007631{
Victor Stinner554f3f02010-06-16 23:33:54 +00007632 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007633 BOOL *pusedDefaultChar = &usedDefaultChar;
7634 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007635 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007636 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007637 const DWORD flags = encode_code_page_flags(code_page, NULL);
7638 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007639 /* Create a substring so that we can get the UTF-16 representation
7640 of just the slice under consideration. */
7641 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007642
Martin v. Löwis3d325192011-11-04 18:23:06 +01007643 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007644
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007646 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007648 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007649
Victor Stinner2fc507f2011-11-04 20:06:39 +01007650 substring = PyUnicode_Substring(unicode, offset, offset+len);
7651 if (substring == NULL)
7652 return -1;
7653 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7654 if (p == NULL) {
7655 Py_DECREF(substring);
7656 return -1;
7657 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007658 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007659
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007660 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007661 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007662 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007663 NULL, 0,
7664 NULL, pusedDefaultChar);
7665 if (outsize <= 0)
7666 goto error;
7667 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007668 if (pusedDefaultChar && *pusedDefaultChar) {
7669 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007670 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007671 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007672
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007675 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007676 if (*outbytes == NULL) {
7677 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007679 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007681 }
7682 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007684 const Py_ssize_t n = PyBytes_Size(*outbytes);
7685 if (outsize > PY_SSIZE_T_MAX - n) {
7686 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007687 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007689 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007690 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7691 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007693 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007695 }
7696
7697 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007699 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007700 out, outsize,
7701 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007702 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007703 if (outsize <= 0)
7704 goto error;
7705 if (pusedDefaultChar && *pusedDefaultChar)
7706 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007707 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007708
Victor Stinner3a50e702011-10-18 21:21:00 +02007709error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007710 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007711 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7712 return -2;
7713 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007714 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007715}
7716
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The code points unicode[unicode_offset : unicode_offset + insize] are
 * encoded one at a time and appended to *outbytes (created here when it is
 * NULL).  A code point that WideCharToMultiByte() cannot encode is passed to
 * the `errors` handler, whose replacement (bytes, or a str restricted to
 * code points <= 127) is appended instead.
 *
 * When `errors` is NULL or "strict", no encoding is attempted: a
 * UnicodeEncodeError is raised through PyCodec_StrictErrors() and -1 is
 * returned.
 *
 * Returns 0 on success, or sets an exception and returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The "used default char" output is only requested for code pages
       other than UTF-8 and UTF-7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) output bytes per input code point, guarding
       the multiplication against overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Code points >= 0x10000 are passed as a surrogate pair. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Success only counts when no default character was
               substituted; otherwise fall through to the error handler. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* Ask the error handler for a replacement for the single code point
           at `pos`; it also tells us where to resume (newpos). */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied verbatim */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Resize the output by (replacement length - 1); `out` is
                   recomputed because the resize may move the object. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* str replacement: each code point must be <= 127 and is
               written as one byte */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            const void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* NOTE(review): `pos` was already set to `newpos`
                       above, so the range reported here is the resume
                       position, not the position that failed to encode --
                       confirm this is intended. */
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the over-allocated output down to the bytes actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

error:
    /* Shared exit: `ret` is still -1 unless the success path set it to 0. */
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7904
Victor Stinner3a50e702011-10-18 21:21:00 +02007905static PyObject *
7906encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007907 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007908 const char *errors)
7909{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007910 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007911 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007912 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007913 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007914
Victor Stinner29dacf22015-01-26 16:41:32 +01007915 if (!PyUnicode_Check(unicode)) {
7916 PyErr_BadArgument();
7917 return NULL;
7918 }
7919
Benjamin Petersonbac79492012-01-14 13:34:47 -05007920 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007921 return NULL;
7922 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007923
Victor Stinner3a50e702011-10-18 21:21:00 +02007924 if (code_page < 0) {
7925 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7926 return NULL;
7927 }
7928
Martin v. Löwis3d325192011-11-04 18:23:06 +01007929 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007930 return PyBytes_FromStringAndSize(NULL, 0);
7931
Victor Stinner7581cef2011-11-03 22:32:33 +01007932 offset = 0;
7933 do
7934 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007935#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007936 if (len > DECODING_CHUNK_SIZE) {
7937 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007938 done = 0;
7939 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007940 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007941#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007942 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007943 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007944 done = 1;
7945 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007946
Victor Stinner76a31a62011-11-04 00:05:13 +01007947 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007948 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007949 errors);
7950 if (ret == -2)
7951 ret = encode_code_page_errors(code_page, &outbytes,
7952 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007953 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007954 if (ret < 0) {
7955 Py_XDECREF(outbytes);
7956 return NULL;
7957 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007958
Victor Stinner7581cef2011-11-03 22:32:33 +01007959 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007960 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007961 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007962
Victor Stinner3a50e702011-10-18 21:21:00 +02007963 return outbytes;
7964}
7965
7966PyObject *
7967PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7968 Py_ssize_t size,
7969 const char *errors)
7970{
Victor Stinner7581cef2011-11-03 22:32:33 +01007971 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007972 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007973 if (unicode == NULL)
7974 return NULL;
7975 res = encode_code_page(CP_ACP, unicode, errors);
7976 Py_DECREF(unicode);
7977 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007978}
7979
7980PyObject *
7981PyUnicode_EncodeCodePage(int code_page,
7982 PyObject *unicode,
7983 const char *errors)
7984{
Victor Stinner7581cef2011-11-03 22:32:33 +01007985 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007986}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007987
Alexander Belopolsky40018472011-02-26 01:02:56 +00007988PyObject *
7989PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007990{
Victor Stinner7581cef2011-11-03 22:32:33 +01007991 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007992}
7993
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007994#undef NEED_RETRY
7995
Steve Dowercc16be82016-09-08 10:35:16 -07007996#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007997
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998/* --- Character Mapping Codec -------------------------------------------- */
7999
/*
 * Decode `size` bytes at `s` through a str `mapping` and append the result
 * to `writer`.
 *
 * Each input byte is used as an index into `mapping`; the character found
 * there is the decoded character.  An index beyond the end of the mapping,
 * or a mapped value of 0xFFFE, is an undefined mapping and is given to the
 * `errors` handler.
 *
 * Returns 0 on success, -1 with an exception set on error.
 */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            /* maplen >= 256, so every byte value is a valid index */
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maxchar to 0xff, then reload the
                   cached maxchar and data pointer from the writer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* The error handler is never called on this path, so errorHandler
           and exc are still NULL here. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a 2-byte mapping covering all byte values.
               They write directly into the writer's buffer and leave via
               `goto Error` (with `x` already loaded) as soon as a
               character needs the generic handling below. */
            enum PyUnicode_Kind outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* too wide for the current output (this also catches
                       0xFFFE): let the generic code deal with it */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: one byte per iteration, with a bounds check
           against the mapping length. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Reached by falling through, or by `goto Error` from the tight
           loops above with `s` and `x` describing the current byte. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* `s` is passed by address to the handler call above, so it
               is not advanced here. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8118
8119static int
8120charmap_decode_mapping(const char *s,
8121 Py_ssize_t size,
8122 PyObject *mapping,
8123 const char *errors,
8124 _PyUnicodeWriter *writer)
8125{
8126 const char *starts = s;
8127 const char *e;
8128 Py_ssize_t startinpos, endinpos;
8129 PyObject *errorHandler = NULL, *exc = NULL;
8130 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008131 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008132
8133 e = s + size;
8134
8135 while (s < e) {
8136 ch = *s;
8137
8138 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8139 key = PyLong_FromLong((long)ch);
8140 if (key == NULL)
8141 goto onError;
8142
8143 item = PyObject_GetItem(mapping, key);
8144 Py_DECREF(key);
8145 if (item == NULL) {
8146 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8147 /* No mapping found means: mapping is undefined. */
8148 PyErr_Clear();
8149 goto Undefined;
8150 } else
8151 goto onError;
8152 }
8153
8154 /* Apply mapping */
8155 if (item == Py_None)
8156 goto Undefined;
8157 if (PyLong_Check(item)) {
8158 long value = PyLong_AS_LONG(item);
8159 if (value == 0xFFFE)
8160 goto Undefined;
8161 if (value < 0 || value > MAX_UNICODE) {
8162 PyErr_Format(PyExc_TypeError,
8163 "character mapping must be in range(0x%lx)",
8164 (unsigned long)MAX_UNICODE + 1);
8165 goto onError;
8166 }
8167
8168 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8169 goto onError;
8170 }
8171 else if (PyUnicode_Check(item)) {
8172 if (PyUnicode_READY(item) == -1)
8173 goto onError;
8174 if (PyUnicode_GET_LENGTH(item) == 1) {
8175 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8176 if (value == 0xFFFE)
8177 goto Undefined;
8178 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8179 goto onError;
8180 }
8181 else {
8182 writer->overallocate = 1;
8183 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8184 goto onError;
8185 }
8186 }
8187 else {
8188 /* wrong return value */
8189 PyErr_SetString(PyExc_TypeError,
8190 "character mapping must return integer, None or str");
8191 goto onError;
8192 }
8193 Py_CLEAR(item);
8194 ++s;
8195 continue;
8196
8197Undefined:
8198 /* undefined mapping */
8199 Py_CLEAR(item);
8200 startinpos = s-starts;
8201 endinpos = startinpos+1;
8202 if (unicode_decode_call_errorhandler_writer(
8203 errors, &errorHandler,
8204 "charmap", "character maps to <undefined>",
8205 &starts, &e, &startinpos, &endinpos, &exc, &s,
8206 writer)) {
8207 goto onError;
8208 }
8209 }
8210 Py_XDECREF(errorHandler);
8211 Py_XDECREF(exc);
8212 return 0;
8213
8214onError:
8215 Py_XDECREF(item);
8216 Py_XDECREF(errorHandler);
8217 Py_XDECREF(exc);
8218 return -1;
8219}
8220
Alexander Belopolsky40018472011-02-26 01:02:56 +00008221PyObject *
8222PyUnicode_DecodeCharmap(const char *s,
8223 Py_ssize_t size,
8224 PyObject *mapping,
8225 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008227 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008228
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229 /* Default to Latin-1 */
8230 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008234 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008235 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008236 writer.min_length = size;
8237 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008239
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008240 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008241 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8242 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008243 }
8244 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008245 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008248 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008249
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008251 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 return NULL;
8253}
8254
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008255/* Charmap encoding: the lookup table */
8256
Alexander Belopolsky40018472011-02-26 01:02:56 +00008257struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 PyObject_HEAD
8259 unsigned char level1[32];
8260 int count2, count3;
8261 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008262};
8263
8264static PyObject*
8265encoding_map_size(PyObject *obj, PyObject* args)
8266{
8267 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008268 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008270}
8271
8272static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008273 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 PyDoc_STR("Return the size (in bytes) of this object") },
8275 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008276};
8277
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008278static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008279 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 "EncodingMap", /*tp_name*/
8281 sizeof(struct encoding_map), /*tp_basicsize*/
8282 0, /*tp_itemsize*/
8283 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008284 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008285 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 0, /*tp_getattr*/
8287 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008288 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 0, /*tp_repr*/
8290 0, /*tp_as_number*/
8291 0, /*tp_as_sequence*/
8292 0, /*tp_as_mapping*/
8293 0, /*tp_hash*/
8294 0, /*tp_call*/
8295 0, /*tp_str*/
8296 0, /*tp_getattro*/
8297 0, /*tp_setattro*/
8298 0, /*tp_as_buffer*/
8299 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8300 0, /*tp_doc*/
8301 0, /*tp_traverse*/
8302 0, /*tp_clear*/
8303 0, /*tp_richcompare*/
8304 0, /*tp_weaklistoffset*/
8305 0, /*tp_iter*/
8306 0, /*tp_iternext*/
8307 encoding_map_methods, /*tp_methods*/
8308 0, /*tp_members*/
8309 0, /*tp_getset*/
8310 0, /*tp_base*/
8311 0, /*tp_dict*/
8312 0, /*tp_descr_get*/
8313 0, /*tp_descr_set*/
8314 0, /*tp_dictoffset*/
8315 0, /*tp_init*/
8316 0, /*tp_alloc*/
8317 0, /*tp_new*/
8318 0, /*tp_free*/
8319 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320};
8321
8322PyObject*
8323PyUnicode_BuildEncodingMap(PyObject* string)
8324{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008325 PyObject *result;
8326 struct encoding_map *mresult;
8327 int i;
8328 int need_dict = 0;
8329 unsigned char level1[32];
8330 unsigned char level2[512];
8331 unsigned char *mlevel1, *mlevel2, *mlevel3;
8332 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008333 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008334 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008335 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008336 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008338 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339 PyErr_BadArgument();
8340 return NULL;
8341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 kind = PyUnicode_KIND(string);
8343 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008344 length = PyUnicode_GET_LENGTH(string);
8345 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008346 memset(level1, 0xFF, sizeof level1);
8347 memset(level2, 0xFF, sizeof level2);
8348
8349 /* If there isn't a one-to-one mapping of NULL to \0,
8350 or if there are non-BMP characters, we need to use
8351 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008354 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008355 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 ch = PyUnicode_READ(kind, data, i);
8357 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008358 need_dict = 1;
8359 break;
8360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008362 /* unmapped character */
8363 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 l1 = ch >> 11;
8365 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008366 if (level1[l1] == 0xFF)
8367 level1[l1] = count2++;
8368 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008369 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370 }
8371
8372 if (count2 >= 0xFF || count3 >= 0xFF)
8373 need_dict = 1;
8374
8375 if (need_dict) {
8376 PyObject *result = PyDict_New();
8377 PyObject *key, *value;
8378 if (!result)
8379 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008380 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008382 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008383 if (!key || !value)
8384 goto failed1;
8385 if (PyDict_SetItem(result, key, value) == -1)
8386 goto failed1;
8387 Py_DECREF(key);
8388 Py_DECREF(value);
8389 }
8390 return result;
8391 failed1:
8392 Py_XDECREF(key);
8393 Py_XDECREF(value);
8394 Py_DECREF(result);
8395 return NULL;
8396 }
8397
8398 /* Create a three-level trie */
8399 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8400 16*count2 + 128*count3 - 1);
8401 if (!result)
8402 return PyErr_NoMemory();
8403 PyObject_Init(result, &EncodingMapType);
8404 mresult = (struct encoding_map*)result;
8405 mresult->count2 = count2;
8406 mresult->count3 = count3;
8407 mlevel1 = mresult->level1;
8408 mlevel2 = mresult->level23;
8409 mlevel3 = mresult->level23 + 16*count2;
8410 memcpy(mlevel1, level1, 32);
8411 memset(mlevel2, 0xFF, 16*count2);
8412 memset(mlevel3, 0, 128*count3);
8413 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008414 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008415 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008416 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8417 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008418 /* unmapped character */
8419 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008420 o1 = ch>>11;
8421 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008422 i2 = 16*mlevel1[o1] + o2;
8423 if (mlevel2[i2] == 0xFF)
8424 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008425 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008426 i3 = 128*mlevel2[i2] + o3;
8427 mlevel3[i3] = i;
8428 }
8429 return result;
8430}
8431
8432static int
Victor Stinner22168992011-11-20 17:09:18 +01008433encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008434{
8435 struct encoding_map *map = (struct encoding_map*)mapping;
8436 int l1 = c>>11;
8437 int l2 = (c>>7) & 0xF;
8438 int l3 = c & 0x7F;
8439 int i;
8440
Victor Stinner22168992011-11-20 17:09:18 +01008441 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008443 if (c == 0)
8444 return 0;
8445 /* level 1*/
8446 i = map->level1[l1];
8447 if (i == 0xFF) {
8448 return -1;
8449 }
8450 /* level 2*/
8451 i = map->level23[16*i+l2];
8452 if (i == 0xFF) {
8453 return -1;
8454 }
8455 /* level 3 */
8456 i = map->level23[16*map->count2 + 128*i + l3];
8457 if (i == 0) {
8458 return -1;
8459 }
8460 return i;
8461}
8462
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463/* Lookup the character ch in the mapping. If the character
8464 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008465 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008466static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008467charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468{
Christian Heimes217cfd12007-12-02 14:31:20 +00008469 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470 PyObject *x;
8471
8472 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474 x = PyObject_GetItem(mapping, w);
8475 Py_DECREF(w);
8476 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8478 /* No mapping found means: mapping is undefined. */
8479 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008480 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 } else
8482 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008484 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008486 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 long value = PyLong_AS_LONG(x);
8488 if (value < 0 || value > 255) {
8489 PyErr_SetString(PyExc_TypeError,
8490 "character mapping must be in range(256)");
8491 Py_DECREF(x);
8492 return NULL;
8493 }
8494 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008496 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 /* wrong return value */
8500 PyErr_Format(PyExc_TypeError,
8501 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008502 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 Py_DECREF(x);
8504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505 }
8506}
8507
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008508static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008509charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008510{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8512 /* exponentially overallocate to minimize reallocations */
8513 if (requiredsize < 2*outsize)
8514 requiredsize = 2*outsize;
8515 if (_PyBytes_Resize(outobj, requiredsize))
8516 return -1;
8517 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008518}
8519
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008524 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 space is available. Return a new reference to the object that
8526 was put in the output buffer, or Py_None, if the mapping was undefined
8527 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008528 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008529static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008530charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008531 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008533 PyObject *rep;
8534 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008535 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008536
Andy Lesterdffe4c02020-03-04 07:15:20 -06008537 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008538 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008540 if (res == -1)
8541 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 if (outsize<requiredsize)
8543 if (charmapencode_resize(outobj, outpos, requiredsize))
8544 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008545 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 outstart[(*outpos)++] = (char)res;
8547 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008548 }
8549
8550 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008553 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 Py_DECREF(rep);
8555 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008556 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 if (PyLong_Check(rep)) {
8558 Py_ssize_t requiredsize = *outpos+1;
8559 if (outsize<requiredsize)
8560 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8561 Py_DECREF(rep);
8562 return enc_EXCEPTION;
8563 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008564 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008566 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 else {
8568 const char *repchars = PyBytes_AS_STRING(rep);
8569 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8570 Py_ssize_t requiredsize = *outpos+repsize;
8571 if (outsize<requiredsize)
8572 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8573 Py_DECREF(rep);
8574 return enc_EXCEPTION;
8575 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008576 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 memcpy(outstart + *outpos, repchars, repsize);
8578 *outpos += repsize;
8579 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008581 Py_DECREF(rep);
8582 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583}
8584
8585/* handle an error in PyUnicode_EncodeCharmap
8586 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008587static int
8588charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008589 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008591 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008592 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593{
8594 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008595 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008596 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008597 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008598 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008599 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008601 Py_ssize_t collstartpos = *inpos;
8602 Py_ssize_t collendpos = *inpos+1;
8603 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008604 const char *encoding = "charmap";
8605 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008606 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008607 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008608 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609
Benjamin Petersonbac79492012-01-14 13:34:47 -05008610 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008611 return -1;
8612 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613 /* find all unencodable characters */
8614 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008615 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008616 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008617 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008618 val = encoding_map_lookup(ch, mapping);
8619 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 break;
8621 ++collendpos;
8622 continue;
8623 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008624
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008625 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8626 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 if (rep==NULL)
8628 return -1;
8629 else if (rep!=Py_None) {
8630 Py_DECREF(rep);
8631 break;
8632 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008633 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 }
8636 /* cache callback name lookup
8637 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008638 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008639 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008640
8641 switch (*error_handler) {
8642 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008643 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008644 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008645
8646 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008647 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 x = charmapencode_output('?', mapping, res, respos);
8649 if (x==enc_EXCEPTION) {
8650 return -1;
8651 }
8652 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008653 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 return -1;
8655 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008656 }
8657 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008658 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008659 *inpos = collendpos;
8660 break;
Victor Stinner50149202015-09-22 00:26:54 +02008661
8662 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008663 /* generate replacement (temporarily (mis)uses p) */
8664 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 char buffer[2+29+1+1];
8666 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008667 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 for (cp = buffer; *cp; ++cp) {
8669 x = charmapencode_output(*cp, mapping, res, respos);
8670 if (x==enc_EXCEPTION)
8671 return -1;
8672 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008673 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return -1;
8675 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008676 }
8677 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008678 *inpos = collendpos;
8679 break;
Victor Stinner50149202015-09-22 00:26:54 +02008680
Benjamin Peterson14339b62009-01-31 16:36:08 +00008681 default:
Victor Stinner50149202015-09-22 00:26:54 +02008682 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008683 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008685 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008687 if (PyBytes_Check(repunicode)) {
8688 /* Directly copy bytes result to output. */
8689 Py_ssize_t outsize = PyBytes_Size(*res);
8690 Py_ssize_t requiredsize;
8691 repsize = PyBytes_Size(repunicode);
8692 requiredsize = *respos + repsize;
8693 if (requiredsize > outsize)
8694 /* Make room for all additional bytes. */
8695 if (charmapencode_resize(res, respos, requiredsize)) {
8696 Py_DECREF(repunicode);
8697 return -1;
8698 }
8699 memcpy(PyBytes_AsString(*res) + *respos,
8700 PyBytes_AsString(repunicode), repsize);
8701 *respos += repsize;
8702 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008703 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008704 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008705 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008706 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008707 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008708 Py_DECREF(repunicode);
8709 return -1;
8710 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008711 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008712 data = PyUnicode_DATA(repunicode);
8713 kind = PyUnicode_KIND(repunicode);
8714 for (index = 0; index < repsize; index++) {
8715 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8716 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008718 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 return -1;
8720 }
8721 else if (x==enc_FAILED) {
8722 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008723 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 return -1;
8725 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008726 }
8727 *inpos = newpos;
8728 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008729 }
8730 return 0;
8731}
8732
Alexander Belopolsky40018472011-02-26 01:02:56 +00008733PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008734_PyUnicode_EncodeCharmap(PyObject *unicode,
8735 PyObject *mapping,
8736 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738 /* output object */
8739 PyObject *res = NULL;
8740 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008741 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008742 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008743 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008744 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008745 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008747 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008748 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008749 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750
Benjamin Petersonbac79492012-01-14 13:34:47 -05008751 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008752 return NULL;
8753 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008754 data = PyUnicode_DATA(unicode);
8755 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008756
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 /* Default to Latin-1 */
8758 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008759 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761 /* allocate enough for a simple encoding without
8762 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008763 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764 if (res == NULL)
8765 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008766 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008769 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008770 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008772 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 if (x==enc_EXCEPTION) /* error */
8774 goto onError;
8775 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008776 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008778 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 &res, &respos)) {
8780 goto onError;
8781 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008782 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 else
8784 /* done with this character => adjust input position */
8785 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008788 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008789 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008790 if (_PyBytes_Resize(&res, respos) < 0)
8791 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008792
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008793 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008794 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008795 return res;
8796
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008798 Py_XDECREF(res);
8799 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008800 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 return NULL;
8802}
8803
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008804/* Deprecated */
8805PyObject *
8806PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8807 Py_ssize_t size,
8808 PyObject *mapping,
8809 const char *errors)
8810{
8811 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008812 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008813 if (unicode == NULL)
8814 return NULL;
8815 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8816 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008817 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008818}
8819
Alexander Belopolsky40018472011-02-26 01:02:56 +00008820PyObject *
8821PyUnicode_AsCharmapString(PyObject *unicode,
8822 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823{
8824 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 PyErr_BadArgument();
8826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008828 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829}
8830
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008831/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008832static void
8833make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008835 Py_ssize_t startpos, Py_ssize_t endpos,
8836 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008838 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 *exceptionObject = _PyUnicodeTranslateError_Create(
8840 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841 }
8842 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8844 goto onError;
8845 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8846 goto onError;
8847 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8848 goto onError;
8849 return;
8850 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008851 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 }
8853}
8854
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008855/* error handling callback helper:
8856 build arguments, call the callback and check the arguments,
8857 put the result into newpos and return the replacement string, which
8858 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008859static PyObject *
8860unicode_translate_call_errorhandler(const char *errors,
8861 PyObject **errorHandler,
8862 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008864 Py_ssize_t startpos, Py_ssize_t endpos,
8865 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008866{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008867 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008868
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008869 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870 PyObject *restuple;
8871 PyObject *resunicode;
8872
8873 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008875 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008877 }
8878
8879 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008881 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008883
Petr Viktorinffd97532020-02-11 17:46:57 +01008884 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008885 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008887 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008888 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 Py_DECREF(restuple);
8890 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008891 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008892 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 &resunicode, &i_newpos)) {
8894 Py_DECREF(restuple);
8895 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008896 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008897 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008899 else
8900 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008902 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 Py_DECREF(restuple);
8904 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008905 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008906 Py_INCREF(resunicode);
8907 Py_DECREF(restuple);
8908 return resunicode;
8909}
8910
8911/* Lookup the character ch in the mapping and put the result in result,
8912 which must be decrefed by the caller.
8913 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008914static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008916{
Christian Heimes217cfd12007-12-02 14:31:20 +00008917 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008918 PyObject *x;
8919
8920 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008922 x = PyObject_GetItem(mapping, w);
8923 Py_DECREF(w);
8924 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8926 /* No mapping found means: use 1:1 mapping. */
8927 PyErr_Clear();
8928 *result = NULL;
8929 return 0;
8930 } else
8931 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008932 }
8933 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 *result = x;
8935 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008936 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008937 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008939 if (value < 0 || value > MAX_UNICODE) {
8940 PyErr_Format(PyExc_ValueError,
8941 "character mapping must be in range(0x%x)",
8942 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 Py_DECREF(x);
8944 return -1;
8945 }
8946 *result = x;
8947 return 0;
8948 }
8949 else if (PyUnicode_Check(x)) {
8950 *result = x;
8951 return 0;
8952 }
8953 else {
8954 /* wrong return value */
8955 PyErr_SetString(PyExc_TypeError,
8956 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008957 Py_DECREF(x);
8958 return -1;
8959 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008960}
Victor Stinner1194ea02014-04-04 19:37:40 +02008961
8962/* lookup the character, write the result into the writer.
8963 Return 1 if the result was written into the writer, return 0 if the mapping
8964 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008965static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008966charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8967 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008968{
Victor Stinner1194ea02014-04-04 19:37:40 +02008969 PyObject *item;
8970
8971 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008973
8974 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008976 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008979 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008980 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008981
8982 if (item == Py_None) {
8983 Py_DECREF(item);
8984 return 0;
8985 }
8986
8987 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008988 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8989 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8990 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008991 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8992 Py_DECREF(item);
8993 return -1;
8994 }
8995 Py_DECREF(item);
8996 return 1;
8997 }
8998
8999 if (!PyUnicode_Check(item)) {
9000 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009002 }
9003
9004 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9005 Py_DECREF(item);
9006 return -1;
9007 }
9008
9009 Py_DECREF(item);
9010 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009011}
9012
Victor Stinner89a76ab2014-04-05 11:44:04 +02009013static int
9014unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9015 Py_UCS1 *translate)
9016{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009017 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009018 int ret = 0;
9019
Victor Stinner89a76ab2014-04-05 11:44:04 +02009020 if (charmaptranslate_lookup(ch, mapping, &item)) {
9021 return -1;
9022 }
9023
9024 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009025 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009026 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009027 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009028 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009029 /* not found => default to 1:1 mapping */
9030 translate[ch] = ch;
9031 return 1;
9032 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009033 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009034 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009035 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9036 used it */
9037 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009038 /* invalid character or character outside ASCII:
9039 skip the fast translate */
9040 goto exit;
9041 }
9042 translate[ch] = (Py_UCS1)replace;
9043 }
9044 else if (PyUnicode_Check(item)) {
9045 Py_UCS4 replace;
9046
9047 if (PyUnicode_READY(item) == -1) {
9048 Py_DECREF(item);
9049 return -1;
9050 }
9051 if (PyUnicode_GET_LENGTH(item) != 1)
9052 goto exit;
9053
9054 replace = PyUnicode_READ_CHAR(item, 0);
9055 if (replace > 127)
9056 goto exit;
9057 translate[ch] = (Py_UCS1)replace;
9058 }
9059 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009060 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009061 goto exit;
9062 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009063 ret = 1;
9064
Benjamin Peterson1365de72014-04-07 20:15:41 -04009065 exit:
9066 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009067 return ret;
9068}
9069
9070/* Fast path for ascii => ascii translation. Return 1 if the whole string
9071 was translated into writer, return 0 if the input string was partially
9072 translated into writer, raise an exception and return -1 on error. */
9073static int
9074unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009075 _PyUnicodeWriter *writer, int ignore,
9076 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009077{
Victor Stinner872b2912014-04-05 14:27:07 +02009078 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009079 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009080 const Py_UCS1 *in, *end;
9081 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009082 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009083
Victor Stinner89a76ab2014-04-05 11:44:04 +02009084 len = PyUnicode_GET_LENGTH(input);
9085
Victor Stinner872b2912014-04-05 14:27:07 +02009086 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009087
9088 in = PyUnicode_1BYTE_DATA(input);
9089 end = in + len;
9090
9091 assert(PyUnicode_IS_ASCII(writer->buffer));
9092 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9093 out = PyUnicode_1BYTE_DATA(writer->buffer);
9094
Victor Stinner872b2912014-04-05 14:27:07 +02009095 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009096 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009097 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009098 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009099 int translate = unicode_fast_translate_lookup(mapping, ch,
9100 ascii_table);
9101 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009102 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009103 if (translate == 0)
9104 goto exit;
9105 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009106 }
Victor Stinner872b2912014-04-05 14:27:07 +02009107 if (ch2 == 0xfe) {
9108 if (ignore)
9109 continue;
9110 goto exit;
9111 }
9112 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009113 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009114 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009115 }
Victor Stinner872b2912014-04-05 14:27:07 +02009116 res = 1;
9117
9118exit:
9119 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009120 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009121 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009122}
9123
Victor Stinner3222da22015-10-01 22:07:32 +02009124static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125_PyUnicode_TranslateCharmap(PyObject *input,
9126 PyObject *mapping,
9127 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009130 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 Py_ssize_t size, i;
9132 int kind;
9133 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009134 _PyUnicodeWriter writer;
9135 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009136 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009137 PyObject *errorHandler = NULL;
9138 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009139 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009140 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009141
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 PyErr_BadArgument();
9144 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147 if (PyUnicode_READY(input) == -1)
9148 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009149 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 kind = PyUnicode_KIND(input);
9151 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009153 if (size == 0)
9154 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009156 /* allocate enough for a simple 1:1 translation without
9157 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009158 _PyUnicodeWriter_Init(&writer);
9159 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161
Victor Stinner872b2912014-04-05 14:27:07 +02009162 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9163
Victor Stinner33798672016-03-01 21:59:58 +01009164 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009165 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009166 if (PyUnicode_IS_ASCII(input)) {
9167 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9168 if (res < 0) {
9169 _PyUnicodeWriter_Dealloc(&writer);
9170 return NULL;
9171 }
9172 if (res == 1)
9173 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009174 }
Victor Stinner33798672016-03-01 21:59:58 +01009175 else {
9176 i = 0;
9177 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009181 int translate;
9182 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9183 Py_ssize_t newpos;
9184 /* startpos for collecting untranslatable chars */
9185 Py_ssize_t collstart;
9186 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009187 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188
Victor Stinner1194ea02014-04-04 19:37:40 +02009189 ch = PyUnicode_READ(kind, data, i);
9190 translate = charmaptranslate_output(ch, mapping, &writer);
9191 if (translate < 0)
9192 goto onError;
9193
9194 if (translate != 0) {
9195 /* it worked => adjust input pointer */
9196 ++i;
9197 continue;
9198 }
9199
9200 /* untranslatable character */
9201 collstart = i;
9202 collend = i+1;
9203
9204 /* find all untranslatable characters */
9205 while (collend < size) {
9206 PyObject *x;
9207 ch = PyUnicode_READ(kind, data, collend);
9208 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009209 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009210 Py_XDECREF(x);
9211 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009212 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009213 ++collend;
9214 }
9215
9216 if (ignore) {
9217 i = collend;
9218 }
9219 else {
9220 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9221 reason, input, &exc,
9222 collstart, collend, &newpos);
9223 if (repunicode == NULL)
9224 goto onError;
9225 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009227 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009228 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009229 Py_DECREF(repunicode);
9230 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009231 }
9232 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009233 Py_XDECREF(exc);
9234 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009235 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236
Benjamin Peterson29060642009-01-31 22:14:21 +00009237 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009238 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009239 Py_XDECREF(exc);
9240 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241 return NULL;
9242}
9243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244/* Deprecated. Use PyUnicode_Translate instead. */
9245PyObject *
9246PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9247 Py_ssize_t size,
9248 PyObject *mapping,
9249 const char *errors)
9250{
Christian Heimes5f520f42012-09-11 14:03:25 +02009251 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009252 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253 if (!unicode)
9254 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009255 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9256 Py_DECREF(unicode);
9257 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258}
9259
Alexander Belopolsky40018472011-02-26 01:02:56 +00009260PyObject *
9261PyUnicode_Translate(PyObject *str,
9262 PyObject *mapping,
9263 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009265 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009266 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009267 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268}
Tim Petersced69f82003-09-16 20:30:58 +00009269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270PyObject *
9271_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9272{
9273 if (!PyUnicode_Check(unicode)) {
9274 PyErr_BadInternalCall();
9275 return NULL;
9276 }
9277 if (PyUnicode_READY(unicode) == -1)
9278 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009279 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 /* If the string is already ASCII, just return the same string */
9281 Py_INCREF(unicode);
9282 return unicode;
9283 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009284
9285 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9286 PyObject *result = PyUnicode_New(len, 127);
9287 if (result == NULL) {
9288 return NULL;
9289 }
9290
9291 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9292 int kind = PyUnicode_KIND(unicode);
9293 const void *data = PyUnicode_DATA(unicode);
9294 Py_ssize_t i;
9295 for (i = 0; i < len; ++i) {
9296 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9297 if (ch < 127) {
9298 out[i] = ch;
9299 }
9300 else if (Py_UNICODE_ISSPACE(ch)) {
9301 out[i] = ' ';
9302 }
9303 else {
9304 int decimal = Py_UNICODE_TODECIMAL(ch);
9305 if (decimal < 0) {
9306 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009307 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009308 _PyUnicode_LENGTH(result) = i + 1;
9309 break;
9310 }
9311 out[i] = '0' + decimal;
9312 }
9313 }
9314
INADA Naoki16dfca42018-07-14 12:06:43 +09009315 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009316 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317}
9318
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009319PyObject *
9320PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9321 Py_ssize_t length)
9322{
Victor Stinnerf0124502011-11-21 23:12:56 +01009323 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009324 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009325 Py_UCS4 maxchar;
9326 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009327 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009328
Victor Stinner99d7ad02012-02-22 13:37:39 +01009329 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009330 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009331 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009332 if (ch > 127) {
9333 int decimal = Py_UNICODE_TODECIMAL(ch);
9334 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009335 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009336 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009337 }
9338 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009339
9340 /* Copy to a new string */
9341 decimal = PyUnicode_New(length, maxchar);
9342 if (decimal == NULL)
9343 return decimal;
9344 kind = PyUnicode_KIND(decimal);
9345 data = PyUnicode_DATA(decimal);
9346 /* Iterate over code points */
9347 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009348 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009349 if (ch > 127) {
9350 int decimal = Py_UNICODE_TODECIMAL(ch);
9351 if (decimal >= 0)
9352 ch = '0' + decimal;
9353 }
9354 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009356 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009357}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009358/* --- Decimal Encoder ---------------------------------------------------- */
9359
Alexander Belopolsky40018472011-02-26 01:02:56 +00009360int
9361PyUnicode_EncodeDecimal(Py_UNICODE *s,
9362 Py_ssize_t length,
9363 char *output,
9364 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009365{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009366 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009367 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009368 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009369 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009370
9371 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 PyErr_BadArgument();
9373 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009374 }
9375
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009376 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009377 if (unicode == NULL)
9378 return -1;
9379
Victor Stinner42bf7752011-11-21 22:52:58 +01009380 kind = PyUnicode_KIND(unicode);
9381 data = PyUnicode_DATA(unicode);
9382
Victor Stinnerb84d7232011-11-22 01:50:07 +01009383 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009384 PyObject *exc;
9385 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009386 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009387 Py_ssize_t startpos;
9388
9389 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009390
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009392 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009393 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009395 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009396 decimal = Py_UNICODE_TODECIMAL(ch);
9397 if (decimal >= 0) {
9398 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009399 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009400 continue;
9401 }
9402 if (0 < ch && ch < 256) {
9403 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009404 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009405 continue;
9406 }
Victor Stinner6345be92011-11-25 20:09:01 +01009407
Victor Stinner42bf7752011-11-21 22:52:58 +01009408 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009409 exc = NULL;
9410 raise_encode_exception(&exc, "decimal", unicode,
9411 startpos, startpos+1,
9412 "invalid decimal Unicode string");
9413 Py_XDECREF(exc);
9414 Py_DECREF(unicode);
9415 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009416 }
9417 /* 0-terminate the output string */
9418 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009419 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009420 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009421}
9422
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423/* --- Helpers ------------------------------------------------------------ */
9424
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; they are updated in place
   for a sequence of length `len`:
     - end   > len is clamped to len; a negative end is taken relative to
       len and then clamped to 0;
     - a negative start is taken relative to len and then clamped to 0.
   A start greater than len is left untouched; callers detect the empty
   slice themselves.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. inside an unbraced if/else), and the arguments are
   parenthesized so that expressions can be passed for `len`. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009441any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009443 Py_ssize_t end,
9444 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009446 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009447 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 Py_ssize_t len1, len2, result;
9449
9450 kind1 = PyUnicode_KIND(s1);
9451 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009452 if (kind1 < kind2)
9453 return -1;
9454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 len1 = PyUnicode_GET_LENGTH(s1);
9456 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009457 ADJUST_INDICES(start, end, len1);
9458 if (end - start < len2)
9459 return -1;
9460
9461 buf1 = PyUnicode_DATA(s1);
9462 buf2 = PyUnicode_DATA(s2);
9463 if (len2 == 1) {
9464 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9465 result = findchar((const char *)buf1 + kind1*start,
9466 kind1, end - start, ch, direction);
9467 if (result == -1)
9468 return -1;
9469 else
9470 return start + result;
9471 }
9472
9473 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009474 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009475 if (!buf2)
9476 return -2;
9477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478
Victor Stinner794d5672011-10-10 03:21:36 +02009479 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009480 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009481 case PyUnicode_1BYTE_KIND:
9482 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9483 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9484 else
9485 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9486 break;
9487 case PyUnicode_2BYTE_KIND:
9488 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9489 break;
9490 case PyUnicode_4BYTE_KIND:
9491 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9492 break;
9493 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009494 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009495 }
9496 }
9497 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009498 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009499 case PyUnicode_1BYTE_KIND:
9500 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9501 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9502 else
9503 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9504 break;
9505 case PyUnicode_2BYTE_KIND:
9506 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9507 break;
9508 case PyUnicode_4BYTE_KIND:
9509 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9510 break;
9511 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009512 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009513 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 }
9515
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009516 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009517 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009518 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519
9520 return result;
9521}
9522
Victor Stinner59423e32018-11-26 13:40:01 +01009523/* _PyUnicode_InsertThousandsGrouping() helper functions */
9524#include "stringlib/localeutil.h"
9525
9526/**
9527 * InsertThousandsGrouping:
9528 * @writer: Unicode writer.
9529 * @n_buffer: Number of characters in @buffer.
9530 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9531 * @d_pos: Start of digits string.
9532 * @n_digits: The number of digits in the string, in which we want
9533 * to put the grouping chars.
9534 * @min_width: The minimum width of the digits in the output string.
9535 * Output will be zero-padded on the left to fill.
9536 * @grouping: see definition in localeconv().
9537 * @thousands_sep: see definition in localeconv().
9538 *
9539 * There are 2 modes: counting and filling. If @writer is NULL,
9540 * we are in counting mode, else filling mode.
9541 * If counting, the required buffer size is returned.
9542 * If filling, we know the buffer will be large enough, so we don't
9543 * need to pass in the buffer size.
9544 * Inserts thousand grouping characters (as defined by grouping and
9545 * thousands_sep) into @writer.
9546 *
9547 * Return value: -1 on error, number of characters otherwise.
9548 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009550_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009551 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009552 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009553 PyObject *digits,
9554 Py_ssize_t d_pos,
9555 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009556 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009557 const char *grouping,
9558 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009559 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560{
Xtreak3f7983a2019-01-07 20:39:14 +05309561 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009562 if (writer) {
9563 assert(digits != NULL);
9564 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009565 }
9566 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009567 assert(digits == NULL);
9568 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009569 }
Victor Stinner59423e32018-11-26 13:40:01 +01009570 assert(0 <= d_pos);
9571 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009572 assert(grouping != NULL);
9573
9574 if (digits != NULL) {
9575 if (PyUnicode_READY(digits) == -1) {
9576 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009577 }
Victor Stinner59423e32018-11-26 13:40:01 +01009578 }
9579 if (PyUnicode_READY(thousands_sep) == -1) {
9580 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009581 }
9582
Victor Stinner59423e32018-11-26 13:40:01 +01009583 Py_ssize_t count = 0;
9584 Py_ssize_t n_zeros;
9585 int loop_broken = 0;
9586 int use_separator = 0; /* First time through, don't append the
9587 separator. They only go between
9588 groups. */
9589 Py_ssize_t buffer_pos;
9590 Py_ssize_t digits_pos;
9591 Py_ssize_t len;
9592 Py_ssize_t n_chars;
9593 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9594 be looked at */
9595 /* A generator that returns all of the grouping widths, until it
9596 returns 0. */
9597 GroupGenerator groupgen;
9598 GroupGenerator_init(&groupgen, grouping);
9599 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9600
9601 /* if digits are not grouped, thousands separator
9602 should be an empty string */
9603 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9604
9605 digits_pos = d_pos + n_digits;
9606 if (writer) {
9607 buffer_pos = writer->pos + n_buffer;
9608 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9609 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 }
Victor Stinner59423e32018-11-26 13:40:01 +01009611 else {
9612 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009613 }
Victor Stinner59423e32018-11-26 13:40:01 +01009614
9615 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009616 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009617 }
Victor Stinner59423e32018-11-26 13:40:01 +01009618
9619 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9620 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9621 n_zeros = Py_MAX(0, len - remaining);
9622 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9623
9624 /* Use n_zero zero's and n_chars chars */
9625
9626 /* Count only, don't do anything. */
9627 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9628
9629 /* Copy into the writer. */
9630 InsertThousandsGrouping_fill(writer, &buffer_pos,
9631 digits, &digits_pos,
9632 n_chars, n_zeros,
9633 use_separator ? thousands_sep : NULL,
9634 thousands_sep_len, maxchar);
9635
9636 /* Use a separator next time. */
9637 use_separator = 1;
9638
9639 remaining -= n_chars;
9640 min_width -= len;
9641
9642 if (remaining <= 0 && min_width <= 0) {
9643 loop_broken = 1;
9644 break;
9645 }
9646 min_width -= thousands_sep_len;
9647 }
9648 if (!loop_broken) {
9649 /* We left the loop without using a break statement. */
9650
9651 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9652 n_zeros = Py_MAX(0, len - remaining);
9653 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9654
9655 /* Use n_zero zero's and n_chars chars */
9656 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9657
9658 /* Copy into the writer. */
9659 InsertThousandsGrouping_fill(writer, &buffer_pos,
9660 digits, &digits_pos,
9661 n_chars, n_zeros,
9662 use_separator ? thousands_sep : NULL,
9663 thousands_sep_len, maxchar);
9664 }
9665 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666}
9667
9668
Alexander Belopolsky40018472011-02-26 01:02:56 +00009669Py_ssize_t
9670PyUnicode_Count(PyObject *str,
9671 PyObject *substr,
9672 Py_ssize_t start,
9673 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009675 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009676 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009677 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009679
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009680 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009681 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009682
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009683 kind1 = PyUnicode_KIND(str);
9684 kind2 = PyUnicode_KIND(substr);
9685 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009686 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009687
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009688 len1 = PyUnicode_GET_LENGTH(str);
9689 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009691 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009692 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009693
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009694 buf1 = PyUnicode_DATA(str);
9695 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009696 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009697 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009698 if (!buf2)
9699 goto onError;
9700 }
9701
9702 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009704 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009705 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009706 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009707 buf2, len2, PY_SSIZE_T_MAX
9708 );
9709 else
9710 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009711 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009712 buf2, len2, PY_SSIZE_T_MAX
9713 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 break;
9715 case PyUnicode_2BYTE_KIND:
9716 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009717 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 buf2, len2, PY_SSIZE_T_MAX
9719 );
9720 break;
9721 case PyUnicode_4BYTE_KIND:
9722 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009723 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 buf2, len2, PY_SSIZE_T_MAX
9725 );
9726 break;
9727 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009728 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009730
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009731 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009732 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009733 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009737 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9738 if (kind2 != kind1)
9739 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741}
9742
Alexander Belopolsky40018472011-02-26 01:02:56 +00009743Py_ssize_t
9744PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009745 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009746 Py_ssize_t start,
9747 Py_ssize_t end,
9748 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009750 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009751 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009752
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009753 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754}
9755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756Py_ssize_t
9757PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9758 Py_ssize_t start, Py_ssize_t end,
9759 int direction)
9760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009762 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 if (PyUnicode_READY(str) == -1)
9764 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009765 len = PyUnicode_GET_LENGTH(str);
9766 ADJUST_INDICES(start, end, len);
9767 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009768 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009770 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9771 kind, end-start, ch, direction);
9772 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009774 else
9775 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776}
9777
Alexander Belopolsky40018472011-02-26 01:02:56 +00009778static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009779tailmatch(PyObject *self,
9780 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009781 Py_ssize_t start,
9782 Py_ssize_t end,
9783 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 int kind_self;
9786 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009787 const void *data_self;
9788 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 Py_ssize_t offset;
9790 Py_ssize_t i;
9791 Py_ssize_t end_sub;
9792
9793 if (PyUnicode_READY(self) == -1 ||
9794 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009795 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9798 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009800 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009802 if (PyUnicode_GET_LENGTH(substring) == 0)
9803 return 1;
9804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 kind_self = PyUnicode_KIND(self);
9806 data_self = PyUnicode_DATA(self);
9807 kind_sub = PyUnicode_KIND(substring);
9808 data_sub = PyUnicode_DATA(substring);
9809 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9810
9811 if (direction > 0)
9812 offset = end;
9813 else
9814 offset = start;
9815
9816 if (PyUnicode_READ(kind_self, data_self, offset) ==
9817 PyUnicode_READ(kind_sub, data_sub, 0) &&
9818 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9819 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9820 /* If both are of the same kind, memcmp is sufficient */
9821 if (kind_self == kind_sub) {
9822 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009823 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 data_sub,
9825 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009826 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009828 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 else {
9830 /* We do not need to compare 0 and len(substring)-1 because
9831 the if statement above ensured already that they are equal
9832 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 for (i = 1; i < end_sub; ++i) {
9834 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9835 PyUnicode_READ(kind_sub, data_sub, i))
9836 return 0;
9837 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009838 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840 }
9841
9842 return 0;
9843}
9844
Alexander Belopolsky40018472011-02-26 01:02:56 +00009845Py_ssize_t
9846PyUnicode_Tailmatch(PyObject *str,
9847 PyObject *substr,
9848 Py_ssize_t start,
9849 Py_ssize_t end,
9850 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009852 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009854
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009855 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856}
9857
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009858static PyObject *
9859ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009861 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009862 const char *data = PyUnicode_DATA(self);
9863 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009864 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009865
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009866 res = PyUnicode_New(len, 127);
9867 if (res == NULL)
9868 return NULL;
9869 resdata = PyUnicode_DATA(res);
9870 if (lower)
9871 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009873 _Py_bytes_upper(resdata, data, len);
9874 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875}
9876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009878handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009880 Py_ssize_t j;
9881 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009882 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009883 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009884
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009885 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9886
9887 where ! is a negation and \p{xxx} is a character with property xxx.
9888 */
9889 for (j = i - 1; j >= 0; j--) {
9890 c = PyUnicode_READ(kind, data, j);
9891 if (!_PyUnicode_IsCaseIgnorable(c))
9892 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009894 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9895 if (final_sigma) {
9896 for (j = i + 1; j < length; j++) {
9897 c = PyUnicode_READ(kind, data, j);
9898 if (!_PyUnicode_IsCaseIgnorable(c))
9899 break;
9900 }
9901 final_sigma = j == length || !_PyUnicode_IsCased(c);
9902 }
9903 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904}
9905
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009906static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009907lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009908 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009910 /* Obscure special case. */
9911 if (c == 0x3A3) {
9912 mapped[0] = handle_capital_sigma(kind, data, length, i);
9913 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009915 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916}
9917
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009918static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009919do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009921 Py_ssize_t i, k = 0;
9922 int n_res, j;
9923 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009924
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009925 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009926 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009927 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009928 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009929 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009931 for (i = 1; i < length; i++) {
9932 c = PyUnicode_READ(kind, data, i);
9933 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9934 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009935 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009936 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009937 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009938 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009939 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940}
9941
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009942static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009943do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009944 Py_ssize_t i, k = 0;
9945
9946 for (i = 0; i < length; i++) {
9947 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9948 int n_res, j;
9949 if (Py_UNICODE_ISUPPER(c)) {
9950 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9951 }
9952 else if (Py_UNICODE_ISLOWER(c)) {
9953 n_res = _PyUnicode_ToUpperFull(c, mapped);
9954 }
9955 else {
9956 n_res = 1;
9957 mapped[0] = c;
9958 }
9959 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009960 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009961 res[k++] = mapped[j];
9962 }
9963 }
9964 return k;
9965}
9966
9967static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009968do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009969 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009971 Py_ssize_t i, k = 0;
9972
9973 for (i = 0; i < length; i++) {
9974 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9975 int n_res, j;
9976 if (lower)
9977 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9978 else
9979 n_res = _PyUnicode_ToUpperFull(c, mapped);
9980 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009981 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009982 res[k++] = mapped[j];
9983 }
9984 }
9985 return k;
9986}
9987
9988static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009989do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009990{
9991 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9992}
9993
9994static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009995do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009996{
9997 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9998}
9999
Benjamin Petersone51757f2012-01-12 21:10:29 -050010000static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010001do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010002{
10003 Py_ssize_t i, k = 0;
10004
10005 for (i = 0; i < length; i++) {
10006 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10007 Py_UCS4 mapped[3];
10008 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10009 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010010 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010011 res[k++] = mapped[j];
10012 }
10013 }
10014 return k;
10015}
10016
10017static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010018do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010019{
10020 Py_ssize_t i, k = 0;
10021 int previous_is_cased;
10022
10023 previous_is_cased = 0;
10024 for (i = 0; i < length; i++) {
10025 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10026 Py_UCS4 mapped[3];
10027 int n_res, j;
10028
10029 if (previous_is_cased)
10030 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10031 else
10032 n_res = _PyUnicode_ToTitleFull(c, mapped);
10033
10034 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010035 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010036 res[k++] = mapped[j];
10037 }
10038
10039 previous_is_cased = _PyUnicode_IsCased(c);
10040 }
10041 return k;
10042}
10043
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010044static PyObject *
10045case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010046 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010047{
10048 PyObject *res = NULL;
10049 Py_ssize_t length, newlength = 0;
10050 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010051 const void *data;
10052 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010053 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10054
Benjamin Petersoneea48462012-01-16 14:28:50 -050010055 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010056
10057 kind = PyUnicode_KIND(self);
10058 data = PyUnicode_DATA(self);
10059 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010060 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010061 PyErr_SetString(PyExc_OverflowError, "string is too long");
10062 return NULL;
10063 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010064 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010065 if (tmp == NULL)
10066 return PyErr_NoMemory();
10067 newlength = perform(kind, data, length, tmp, &maxchar);
10068 res = PyUnicode_New(newlength, maxchar);
10069 if (res == NULL)
10070 goto leave;
10071 tmpend = tmp + newlength;
10072 outdata = PyUnicode_DATA(res);
10073 outkind = PyUnicode_KIND(res);
10074 switch (outkind) {
10075 case PyUnicode_1BYTE_KIND:
10076 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10077 break;
10078 case PyUnicode_2BYTE_KIND:
10079 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10080 break;
10081 case PyUnicode_4BYTE_KIND:
10082 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10083 break;
10084 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010085 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010086 }
10087 leave:
10088 PyMem_FREE(tmp);
10089 return res;
10090}
10091
Tim Peters8ce9f162004-08-27 01:49:32 +000010092PyObject *
10093PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010094{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010095 PyObject *res;
10096 PyObject *fseq;
10097 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010098 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010100 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010101 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010102 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010103 }
10104
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010105 /* NOTE: the following code can't call back into Python code,
10106 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010107 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010108
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010109 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010110 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010111 res = _PyUnicode_JoinArray(separator, items, seqlen);
10112 Py_DECREF(fseq);
10113 return res;
10114}
10115
10116PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010117_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010118{
10119 PyObject *res = NULL; /* the result */
10120 PyObject *sep = NULL;
10121 Py_ssize_t seplen;
10122 PyObject *item;
10123 Py_ssize_t sz, i, res_offset;
10124 Py_UCS4 maxchar;
10125 Py_UCS4 item_maxchar;
10126 int use_memcpy;
10127 unsigned char *res_data = NULL, *sep_data = NULL;
10128 PyObject *last_obj;
10129 unsigned int kind = 0;
10130
Tim Peters05eba1f2004-08-27 21:32:02 +000010131 /* If empty sequence, return u"". */
10132 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010133 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010134 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010135
Tim Peters05eba1f2004-08-27 21:32:02 +000010136 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010137 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010138 if (seqlen == 1) {
10139 if (PyUnicode_CheckExact(items[0])) {
10140 res = items[0];
10141 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010142 return res;
10143 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010144 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010145 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010146 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010147 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010148 /* Set up sep and seplen */
10149 if (separator == NULL) {
10150 /* fall back to a blank space separator */
10151 sep = PyUnicode_FromOrdinal(' ');
10152 if (!sep)
10153 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010154 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010155 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010156 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010157 else {
10158 if (!PyUnicode_Check(separator)) {
10159 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010160 "separator: expected str instance,"
10161 " %.80s found",
10162 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010163 goto onError;
10164 }
10165 if (PyUnicode_READY(separator))
10166 goto onError;
10167 sep = separator;
10168 seplen = PyUnicode_GET_LENGTH(separator);
10169 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10170 /* inc refcount to keep this code path symmetric with the
10171 above case of a blank separator */
10172 Py_INCREF(sep);
10173 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010174 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010175 }
10176
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010177 /* There are at least two things to join, or else we have a subclass
10178 * of str in the sequence.
10179 * Do a pre-pass to figure out the total amount of space we'll
10180 * need (sz), and see whether all argument are strings.
10181 */
10182 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010183#ifdef Py_DEBUG
10184 use_memcpy = 0;
10185#else
10186 use_memcpy = 1;
10187#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010188 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010189 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010190 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010191 if (!PyUnicode_Check(item)) {
10192 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010193 "sequence item %zd: expected str instance,"
10194 " %.80s found",
10195 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010196 goto onError;
10197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 if (PyUnicode_READY(item) == -1)
10199 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010200 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010202 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010203 if (i != 0) {
10204 add_sz += seplen;
10205 }
10206 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010207 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010208 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010209 goto onError;
10210 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010211 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010212 if (use_memcpy && last_obj != NULL) {
10213 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10214 use_memcpy = 0;
10215 }
10216 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010217 }
Tim Petersced69f82003-09-16 20:30:58 +000010218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010220 if (res == NULL)
10221 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010222
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010223 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010224#ifdef Py_DEBUG
10225 use_memcpy = 0;
10226#else
10227 if (use_memcpy) {
10228 res_data = PyUnicode_1BYTE_DATA(res);
10229 kind = PyUnicode_KIND(res);
10230 if (seplen != 0)
10231 sep_data = PyUnicode_1BYTE_DATA(sep);
10232 }
10233#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010234 if (use_memcpy) {
10235 for (i = 0; i < seqlen; ++i) {
10236 Py_ssize_t itemlen;
10237 item = items[i];
10238
10239 /* Copy item, and maybe the separator. */
10240 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010241 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010242 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010243 kind * seplen);
10244 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010245 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010246
10247 itemlen = PyUnicode_GET_LENGTH(item);
10248 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010249 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010250 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010251 kind * itemlen);
10252 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010253 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010254 }
10255 assert(res_data == PyUnicode_1BYTE_DATA(res)
10256 + kind * PyUnicode_GET_LENGTH(res));
10257 }
10258 else {
10259 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10260 Py_ssize_t itemlen;
10261 item = items[i];
10262
10263 /* Copy item, and maybe the separator. */
10264 if (i && seplen != 0) {
10265 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10266 res_offset += seplen;
10267 }
10268
10269 itemlen = PyUnicode_GET_LENGTH(item);
10270 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010271 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010272 res_offset += itemlen;
10273 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010274 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010275 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010276 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010279 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281
Benjamin Peterson29060642009-01-31 22:14:21 +000010282 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010284 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285 return NULL;
10286}
10287
Victor Stinnerd3f08822012-05-29 12:57:52 +020010288void
10289_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10290 Py_UCS4 fill_char)
10291{
10292 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010293 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010294 assert(PyUnicode_IS_READY(unicode));
10295 assert(unicode_modifiable(unicode));
10296 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10297 assert(start >= 0);
10298 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010299 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010300}
10301
Victor Stinner3fe55312012-01-04 00:33:50 +010010302Py_ssize_t
10303PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10304 Py_UCS4 fill_char)
10305{
10306 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010307
10308 if (!PyUnicode_Check(unicode)) {
10309 PyErr_BadInternalCall();
10310 return -1;
10311 }
10312 if (PyUnicode_READY(unicode) == -1)
10313 return -1;
10314 if (unicode_check_modifiable(unicode))
10315 return -1;
10316
Victor Stinnerd3f08822012-05-29 12:57:52 +020010317 if (start < 0) {
10318 PyErr_SetString(PyExc_IndexError, "string index out of range");
10319 return -1;
10320 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010321 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10322 PyErr_SetString(PyExc_ValueError,
10323 "fill character is bigger than "
10324 "the string maximum character");
10325 return -1;
10326 }
10327
10328 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10329 length = Py_MIN(maxlen, length);
10330 if (length <= 0)
10331 return 0;
10332
Victor Stinnerd3f08822012-05-29 12:57:52 +020010333 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010334 return length;
10335}
10336
Victor Stinner9310abb2011-10-05 00:59:23 +020010337static PyObject *
10338pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010339 Py_ssize_t left,
10340 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 PyObject *u;
10344 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010345 int kind;
10346 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347
10348 if (left < 0)
10349 left = 0;
10350 if (right < 0)
10351 right = 0;
10352
Victor Stinnerc4b49542011-12-11 22:44:26 +010010353 if (left == 0 && right == 0)
10354 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10357 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010358 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10359 return NULL;
10360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010362 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010364 if (!u)
10365 return NULL;
10366
10367 kind = PyUnicode_KIND(u);
10368 data = PyUnicode_DATA(u);
10369 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010370 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010371 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010372 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010373 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010374 assert(_PyUnicode_CheckConsistency(u, 1));
10375 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376}
10377
Alexander Belopolsky40018472011-02-26 01:02:56 +000010378PyObject *
10379PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010383 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385
Benjamin Petersonead6b532011-12-20 17:23:42 -060010386 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010388 if (PyUnicode_IS_ASCII(string))
10389 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 PyUnicode_GET_LENGTH(string), keepends);
10392 else
10393 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010394 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010395 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 break;
10397 case PyUnicode_2BYTE_KIND:
10398 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 PyUnicode_GET_LENGTH(string), keepends);
10401 break;
10402 case PyUnicode_4BYTE_KIND:
10403 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 PyUnicode_GET_LENGTH(string), keepends);
10406 break;
10407 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010408 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411}
10412
Alexander Belopolsky40018472011-02-26 01:02:56 +000010413static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010414split(PyObject *self,
10415 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010416 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010418 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010419 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 Py_ssize_t len1, len2;
10421 PyObject* out;
10422
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010424 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 if (PyUnicode_READY(self) == -1)
10427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010430 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010432 if (PyUnicode_IS_ASCII(self))
10433 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010434 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010435 PyUnicode_GET_LENGTH(self), maxcount
10436 );
10437 else
10438 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010439 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010440 PyUnicode_GET_LENGTH(self), maxcount
10441 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 case PyUnicode_2BYTE_KIND:
10443 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010444 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 PyUnicode_GET_LENGTH(self), maxcount
10446 );
10447 case PyUnicode_4BYTE_KIND:
10448 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010449 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 PyUnicode_GET_LENGTH(self), maxcount
10451 );
10452 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010453 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 }
10455
10456 if (PyUnicode_READY(substring) == -1)
10457 return NULL;
10458
10459 kind1 = PyUnicode_KIND(self);
10460 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 len1 = PyUnicode_GET_LENGTH(self);
10462 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010463 if (kind1 < kind2 || len1 < len2) {
10464 out = PyList_New(1);
10465 if (out == NULL)
10466 return NULL;
10467 Py_INCREF(self);
10468 PyList_SET_ITEM(out, 0, self);
10469 return out;
10470 }
10471 buf1 = PyUnicode_DATA(self);
10472 buf2 = PyUnicode_DATA(substring);
10473 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010474 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010475 if (!buf2)
10476 return NULL;
10477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010479 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010481 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10482 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010483 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010484 else
10485 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010486 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 break;
10488 case PyUnicode_2BYTE_KIND:
10489 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010490 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 break;
10492 case PyUnicode_4BYTE_KIND:
10493 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010494 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 break;
10496 default:
10497 out = NULL;
10498 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010499 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010500 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010501 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503}
10504
Alexander Belopolsky40018472011-02-26 01:02:56 +000010505static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010506rsplit(PyObject *self,
10507 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010508 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010509{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010510 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010511 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 Py_ssize_t len1, len2;
10513 PyObject* out;
10514
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010515 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010516 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 if (PyUnicode_READY(self) == -1)
10519 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010522 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010524 if (PyUnicode_IS_ASCII(self))
10525 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010526 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010527 PyUnicode_GET_LENGTH(self), maxcount
10528 );
10529 else
10530 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010531 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010532 PyUnicode_GET_LENGTH(self), maxcount
10533 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 case PyUnicode_2BYTE_KIND:
10535 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010536 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 PyUnicode_GET_LENGTH(self), maxcount
10538 );
10539 case PyUnicode_4BYTE_KIND:
10540 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010541 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 PyUnicode_GET_LENGTH(self), maxcount
10543 );
10544 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010545 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 }
10547
10548 if (PyUnicode_READY(substring) == -1)
10549 return NULL;
10550
10551 kind1 = PyUnicode_KIND(self);
10552 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 len1 = PyUnicode_GET_LENGTH(self);
10554 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010555 if (kind1 < kind2 || len1 < len2) {
10556 out = PyList_New(1);
10557 if (out == NULL)
10558 return NULL;
10559 Py_INCREF(self);
10560 PyList_SET_ITEM(out, 0, self);
10561 return out;
10562 }
10563 buf1 = PyUnicode_DATA(self);
10564 buf2 = PyUnicode_DATA(substring);
10565 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010566 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010567 if (!buf2)
10568 return NULL;
10569 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010571 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010573 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10574 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010575 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010576 else
10577 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010578 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 break;
10580 case PyUnicode_2BYTE_KIND:
10581 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010582 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 break;
10584 case PyUnicode_4BYTE_KIND:
10585 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010586 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 break;
10588 default:
10589 out = NULL;
10590 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010591 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010592 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010593 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 return out;
10595}
10596
10597static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010598anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10599 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010601 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010603 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10604 return asciilib_find(buf1, len1, buf2, len2, offset);
10605 else
10606 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 case PyUnicode_2BYTE_KIND:
10608 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10609 case PyUnicode_4BYTE_KIND:
10610 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10611 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010612 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613}
10614
10615static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010616anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10617 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010619 switch (kind) {
10620 case PyUnicode_1BYTE_KIND:
10621 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10622 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10623 else
10624 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10625 case PyUnicode_2BYTE_KIND:
10626 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10627 case PyUnicode_4BYTE_KIND:
10628 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10629 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010630 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010631}
10632
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010633static void
10634replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10635 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10636{
10637 int kind = PyUnicode_KIND(u);
10638 void *data = PyUnicode_DATA(u);
10639 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10640 if (kind == PyUnicode_1BYTE_KIND) {
10641 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10642 (Py_UCS1 *)data + len,
10643 u1, u2, maxcount);
10644 }
10645 else if (kind == PyUnicode_2BYTE_KIND) {
10646 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10647 (Py_UCS2 *)data + len,
10648 u1, u2, maxcount);
10649 }
10650 else {
10651 assert(kind == PyUnicode_4BYTE_KIND);
10652 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10653 (Py_UCS4 *)data + len,
10654 u1, u2, maxcount);
10655 }
10656}
10657
Alexander Belopolsky40018472011-02-26 01:02:56 +000010658static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659replace(PyObject *self, PyObject *str1,
10660 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010663 const char *sbuf = PyUnicode_DATA(self);
10664 const void *buf1 = PyUnicode_DATA(str1);
10665 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 int srelease = 0, release1 = 0, release2 = 0;
10667 int skind = PyUnicode_KIND(self);
10668 int kind1 = PyUnicode_KIND(str1);
10669 int kind2 = PyUnicode_KIND(str2);
10670 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10671 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10672 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010673 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010674 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010676 if (slen < len1)
10677 goto nothing;
10678
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010680 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010681 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010682 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683
Victor Stinner59de0ee2011-10-07 10:01:28 +020010684 if (str1 == str2)
10685 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010688 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10689 if (maxchar < maxchar_str1)
10690 /* substring too wide to be present */
10691 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010692 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10693 /* Replacing str1 with str2 may cause a maxchar reduction in the
10694 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010695 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010696 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010699 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010701 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010703 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010704 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010705 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010706
Victor Stinner69ed0f42013-04-09 21:48:24 +020010707 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010708 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010709 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010710 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010711 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010713 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010715
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010716 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10717 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010718 }
10719 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 int rkind = skind;
10721 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010722 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 if (kind1 < rkind) {
10725 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010726 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 if (!buf1) goto error;
10728 release1 = 1;
10729 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010730 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010731 if (i < 0)
10732 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 if (rkind > kind2) {
10734 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010735 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 if (!buf2) goto error;
10737 release2 = 1;
10738 }
10739 else if (rkind < kind2) {
10740 /* widen self and buf1 */
10741 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010742 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010743 assert(buf1 != PyUnicode_DATA(str1));
10744 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010745 buf1 = PyUnicode_DATA(str1);
10746 release1 = 0;
10747 }
10748 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (!sbuf) goto error;
10750 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010751 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 if (!buf1) goto error;
10753 release1 = 1;
10754 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010755 u = PyUnicode_New(slen, maxchar);
10756 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010758 assert(PyUnicode_KIND(u) == rkind);
10759 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010760
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010761 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010762 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010763 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010765 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010767
10768 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010769 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010770 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010771 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010772 if (i == -1)
10773 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010774 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010776 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010780 }
10781 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010783 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 int rkind = skind;
10785 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010788 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010789 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 if (!buf1) goto error;
10791 release1 = 1;
10792 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010793 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010794 if (n == 0)
10795 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010797 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010798 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 if (!buf2) goto error;
10800 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010803 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010805 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 if (!sbuf) goto error;
10807 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010808 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010809 assert(buf1 != PyUnicode_DATA(str1));
10810 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010811 buf1 = PyUnicode_DATA(str1);
10812 release1 = 0;
10813 }
10814 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 if (!buf1) goto error;
10816 release1 = 1;
10817 }
10818 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10819 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010820 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 PyErr_SetString(PyExc_OverflowError,
10822 "replace string is too long");
10823 goto error;
10824 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010825 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010826 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010827 _Py_INCREF_UNICODE_EMPTY();
10828 if (!unicode_empty)
10829 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010830 u = unicode_empty;
10831 goto done;
10832 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010833 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 PyErr_SetString(PyExc_OverflowError,
10835 "replace string is too long");
10836 goto error;
10837 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010838 u = PyUnicode_New(new_size, maxchar);
10839 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010841 assert(PyUnicode_KIND(u) == rkind);
10842 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 ires = i = 0;
10844 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010845 while (n-- > 0) {
10846 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010847 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010848 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010849 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010850 if (j == -1)
10851 break;
10852 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010853 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010854 memcpy(res + rkind * ires,
10855 sbuf + rkind * i,
10856 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010858 }
10859 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010861 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010863 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010869 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010870 memcpy(res + rkind * ires,
10871 sbuf + rkind * i,
10872 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010873 }
10874 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010875 /* interleave */
10876 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010877 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010879 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010881 if (--n <= 0)
10882 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010883 memcpy(res + rkind * ires,
10884 sbuf + rkind * i,
10885 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 ires++;
10887 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010888 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010889 memcpy(res + rkind * ires,
10890 sbuf + rkind * i,
10891 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010892 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010893 }
10894
10895 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010896 unicode_adjust_maxchar(&u);
10897 if (u == NULL)
10898 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010900
10901 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010902 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10903 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10904 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010906 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010908 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010910 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010911 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010913
Benjamin Peterson29060642009-01-31 22:14:21 +000010914 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010915 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010916 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10917 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10918 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010920 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010922 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010924 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010925 return unicode_result_unchanged(self);
10926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010928 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10929 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10930 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10931 if (srelease)
10932 PyMem_FREE((void *)sbuf);
10933 if (release1)
10934 PyMem_FREE((void *)buf1);
10935 if (release2)
10936 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938}
10939
10940/* --- Unicode Object Methods --------------------------------------------- */
10941
INADA Naoki3ae20562017-01-16 20:41:20 +090010942/*[clinic input]
10943str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
INADA Naoki3ae20562017-01-16 20:41:20 +090010945Return a version of the string where each word is titlecased.
10946
10947More specifically, words start with uppercased characters and all remaining
10948cased characters have lower case.
10949[clinic start generated code]*/
10950
10951static PyObject *
10952unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010953/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010955 if (PyUnicode_READY(self) == -1)
10956 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010957 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958}
10959
INADA Naoki3ae20562017-01-16 20:41:20 +090010960/*[clinic input]
10961str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
INADA Naoki3ae20562017-01-16 20:41:20 +090010963Return a capitalized version of the string.
10964
10965More specifically, make the first character have upper case and the rest lower
10966case.
10967[clinic start generated code]*/
10968
10969static PyObject *
10970unicode_capitalize_impl(PyObject *self)
10971/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010973 if (PyUnicode_READY(self) == -1)
10974 return NULL;
10975 if (PyUnicode_GET_LENGTH(self) == 0)
10976 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010977 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978}
10979
INADA Naoki3ae20562017-01-16 20:41:20 +090010980/*[clinic input]
10981str.casefold as unicode_casefold
10982
10983Return a version of the string suitable for caseless comparisons.
10984[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010985
10986static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010987unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010988/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010989{
10990 if (PyUnicode_READY(self) == -1)
10991 return NULL;
10992 if (PyUnicode_IS_ASCII(self))
10993 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010994 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010995}
10996
10997
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010998/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010999
11000static int
11001convert_uc(PyObject *obj, void *addr)
11002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011004
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011005 if (!PyUnicode_Check(obj)) {
11006 PyErr_Format(PyExc_TypeError,
11007 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011008 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011009 return 0;
11010 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011011 if (PyUnicode_READY(obj) < 0)
11012 return 0;
11013 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011014 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011015 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011016 return 0;
11017 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011018 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011019 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011020}
11021
INADA Naoki3ae20562017-01-16 20:41:20 +090011022/*[clinic input]
11023str.center as unicode_center
11024
11025 width: Py_ssize_t
11026 fillchar: Py_UCS4 = ' '
11027 /
11028
11029Return a centered string of length width.
11030
11031Padding is done using the specified fill character (default is a space).
11032[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033
11034static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011035unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11036/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011038 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039
Benjamin Petersonbac79492012-01-14 13:34:47 -050011040 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041 return NULL;
11042
Victor Stinnerc4b49542011-12-11 22:44:26 +010011043 if (PyUnicode_GET_LENGTH(self) >= width)
11044 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
Victor Stinnerc4b49542011-12-11 22:44:26 +010011046 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047 left = marg / 2 + (marg & width & 1);
11048
Victor Stinner9310abb2011-10-05 00:59:23 +020011049 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050}
11051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052/* This function assumes that str1 and str2 are readied by the caller. */
11053
Marc-André Lemburge5034372000-08-08 08:04:29 +000011054static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011055unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011056{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011057#define COMPARE(TYPE1, TYPE2) \
11058 do { \
11059 TYPE1* p1 = (TYPE1 *)data1; \
11060 TYPE2* p2 = (TYPE2 *)data2; \
11061 TYPE1* end = p1 + len; \
11062 Py_UCS4 c1, c2; \
11063 for (; p1 != end; p1++, p2++) { \
11064 c1 = *p1; \
11065 c2 = *p2; \
11066 if (c1 != c2) \
11067 return (c1 < c2) ? -1 : 1; \
11068 } \
11069 } \
11070 while (0)
11071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011073 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011074 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 kind1 = PyUnicode_KIND(str1);
11077 kind2 = PyUnicode_KIND(str2);
11078 data1 = PyUnicode_DATA(str1);
11079 data2 = PyUnicode_DATA(str2);
11080 len1 = PyUnicode_GET_LENGTH(str1);
11081 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011082 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011083
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011084 switch(kind1) {
11085 case PyUnicode_1BYTE_KIND:
11086 {
11087 switch(kind2) {
11088 case PyUnicode_1BYTE_KIND:
11089 {
11090 int cmp = memcmp(data1, data2, len);
11091 /* normalize result of memcmp() into the range [-1; 1] */
11092 if (cmp < 0)
11093 return -1;
11094 if (cmp > 0)
11095 return 1;
11096 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011097 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011098 case PyUnicode_2BYTE_KIND:
11099 COMPARE(Py_UCS1, Py_UCS2);
11100 break;
11101 case PyUnicode_4BYTE_KIND:
11102 COMPARE(Py_UCS1, Py_UCS4);
11103 break;
11104 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011105 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011106 }
11107 break;
11108 }
11109 case PyUnicode_2BYTE_KIND:
11110 {
11111 switch(kind2) {
11112 case PyUnicode_1BYTE_KIND:
11113 COMPARE(Py_UCS2, Py_UCS1);
11114 break;
11115 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011116 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011117 COMPARE(Py_UCS2, Py_UCS2);
11118 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011119 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011120 case PyUnicode_4BYTE_KIND:
11121 COMPARE(Py_UCS2, Py_UCS4);
11122 break;
11123 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011124 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011125 }
11126 break;
11127 }
11128 case PyUnicode_4BYTE_KIND:
11129 {
11130 switch(kind2) {
11131 case PyUnicode_1BYTE_KIND:
11132 COMPARE(Py_UCS4, Py_UCS1);
11133 break;
11134 case PyUnicode_2BYTE_KIND:
11135 COMPARE(Py_UCS4, Py_UCS2);
11136 break;
11137 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011138 {
11139#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11140 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11141 /* normalize result of wmemcmp() into the range [-1; 1] */
11142 if (cmp < 0)
11143 return -1;
11144 if (cmp > 0)
11145 return 1;
11146#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011147 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011148#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011149 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011150 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011151 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011152 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011153 }
11154 break;
11155 }
11156 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011157 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011158 }
11159
Victor Stinner770e19e2012-10-04 22:59:45 +020011160 if (len1 == len2)
11161 return 0;
11162 if (len1 < len2)
11163 return -1;
11164 else
11165 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011166
11167#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011168}
11169
Benjamin Peterson621b4302016-09-09 13:54:34 -070011170static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011171unicode_compare_eq(PyObject *str1, PyObject *str2)
11172{
11173 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011174 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011175 Py_ssize_t len;
11176 int cmp;
11177
Victor Stinnere5567ad2012-10-23 02:48:49 +020011178 len = PyUnicode_GET_LENGTH(str1);
11179 if (PyUnicode_GET_LENGTH(str2) != len)
11180 return 0;
11181 kind = PyUnicode_KIND(str1);
11182 if (PyUnicode_KIND(str2) != kind)
11183 return 0;
11184 data1 = PyUnicode_DATA(str1);
11185 data2 = PyUnicode_DATA(str2);
11186
11187 cmp = memcmp(data1, data2, len * kind);
11188 return (cmp == 0);
11189}
11190
11191
Alexander Belopolsky40018472011-02-26 01:02:56 +000011192int
11193PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11196 if (PyUnicode_READY(left) == -1 ||
11197 PyUnicode_READY(right) == -1)
11198 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011199
11200 /* a string is equal to itself */
11201 if (left == right)
11202 return 0;
11203
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011204 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011206 PyErr_Format(PyExc_TypeError,
11207 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011208 Py_TYPE(left)->tp_name,
11209 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210 return -1;
11211}
11212
Martin v. Löwis5b222132007-06-10 09:51:05 +000011213int
11214PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 Py_ssize_t i;
11217 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011219 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220
Victor Stinner910337b2011-10-03 03:20:16 +020011221 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011222 if (!PyUnicode_IS_READY(uni)) {
11223 const wchar_t *ws = _PyUnicode_WSTR(uni);
11224 /* Compare Unicode string and source character set string */
11225 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11226 if (chr != ustr[i])
11227 return (chr < ustr[i]) ? -1 : 1;
11228 }
11229 /* This check keeps Python strings that end in '\0' from comparing equal
11230 to C strings identical up to that point. */
11231 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11232 return 1; /* uni is longer */
11233 if (ustr[i])
11234 return -1; /* str is longer */
11235 return 0;
11236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011238 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011239 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011240 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011241 size_t len, len2 = strlen(str);
11242 int cmp;
11243
11244 len = Py_MIN(len1, len2);
11245 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011246 if (cmp != 0) {
11247 if (cmp < 0)
11248 return -1;
11249 else
11250 return 1;
11251 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011252 if (len1 > len2)
11253 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011254 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011255 return -1; /* str is longer */
11256 return 0;
11257 }
11258 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011259 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011260 /* Compare Unicode string and source character set string */
11261 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011262 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011263 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11264 /* This check keeps Python strings that end in '\0' from comparing equal
11265 to C strings identical up to that point. */
11266 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11267 return 1; /* uni is longer */
11268 if (str[i])
11269 return -1; /* str is longer */
11270 return 0;
11271 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011272}
11273
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011274static int
11275non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11276{
11277 size_t i, len;
11278 const wchar_t *p;
11279 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11280 if (strlen(str) != len)
11281 return 0;
11282 p = _PyUnicode_WSTR(unicode);
11283 assert(p);
11284 for (i = 0; i < len; i++) {
11285 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011286 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011287 return 0;
11288 }
11289 return 1;
11290}
11291
11292int
11293_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11294{
11295 size_t len;
11296 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011297 assert(str);
11298#ifndef NDEBUG
11299 for (const char *p = str; *p; p++) {
11300 assert((unsigned char)*p < 128);
11301 }
11302#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011303 if (PyUnicode_READY(unicode) == -1) {
11304 /* Memory error or bad data */
11305 PyErr_Clear();
11306 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11307 }
11308 if (!PyUnicode_IS_ASCII(unicode))
11309 return 0;
11310 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11311 return strlen(str) == len &&
11312 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11313}
11314
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011315int
11316_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11317{
11318 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011319
11320 assert(_PyUnicode_CHECK(left));
11321 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011322#ifndef NDEBUG
11323 for (const char *p = right->string; *p; p++) {
11324 assert((unsigned char)*p < 128);
11325 }
11326#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011327
11328 if (PyUnicode_READY(left) == -1) {
11329 /* memory error or bad data */
11330 PyErr_Clear();
11331 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11332 }
11333
11334 if (!PyUnicode_IS_ASCII(left))
11335 return 0;
11336
11337 right_uni = _PyUnicode_FromId(right); /* borrowed */
11338 if (right_uni == NULL) {
11339 /* memory error or bad data */
11340 PyErr_Clear();
11341 return _PyUnicode_EqualToASCIIString(left, right->string);
11342 }
11343
11344 if (left == right_uni)
11345 return 1;
11346
11347 if (PyUnicode_CHECK_INTERNED(left))
11348 return 0;
11349
Victor Stinner607b1022020-05-05 18:50:30 +020011350#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011351 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011352 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011353 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11354 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011355#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011356
11357 return unicode_compare_eq(left, right_uni);
11358}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011359
Alexander Belopolsky40018472011-02-26 01:02:56 +000011360PyObject *
11361PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011362{
11363 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011364
Victor Stinnere5567ad2012-10-23 02:48:49 +020011365 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11366 Py_RETURN_NOTIMPLEMENTED;
11367
11368 if (PyUnicode_READY(left) == -1 ||
11369 PyUnicode_READY(right) == -1)
11370 return NULL;
11371
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011372 if (left == right) {
11373 switch (op) {
11374 case Py_EQ:
11375 case Py_LE:
11376 case Py_GE:
11377 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011378 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011379 case Py_NE:
11380 case Py_LT:
11381 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011382 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011383 default:
11384 PyErr_BadArgument();
11385 return NULL;
11386 }
11387 }
11388 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011389 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011390 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011391 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011392 }
11393 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011394 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011395 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011396 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011397}
11398
Alexander Belopolsky40018472011-02-26 01:02:56 +000011399int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011400_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11401{
11402 return unicode_eq(aa, bb);
11403}
11404
11405int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011406PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011407{
Victor Stinner77282cb2013-04-14 19:22:47 +020011408 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011409 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011411 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011412
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011413 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011414 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011415 "'in <string>' requires string as left operand, not %.100s",
11416 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011417 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011418 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011419 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011420 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011421 if (ensure_unicode(str) < 0)
11422 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011425 kind2 = PyUnicode_KIND(substr);
11426 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011427 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011429 len2 = PyUnicode_GET_LENGTH(substr);
11430 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011431 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011432 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011433 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011434 if (len2 == 1) {
11435 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11436 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011437 return result;
11438 }
11439 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011440 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011441 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011442 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444
Victor Stinner77282cb2013-04-14 19:22:47 +020011445 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 case PyUnicode_1BYTE_KIND:
11447 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11448 break;
11449 case PyUnicode_2BYTE_KIND:
11450 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11451 break;
11452 case PyUnicode_4BYTE_KIND:
11453 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11454 break;
11455 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011456 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011458
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011459 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011460 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011461 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462
Guido van Rossum403d68b2000-03-13 15:55:09 +000011463 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011464}
11465
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466/* Concat to string or Unicode object giving a new Unicode object. */
11467
Alexander Belopolsky40018472011-02-26 01:02:56 +000011468PyObject *
11469PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011471 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011472 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011473 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011475 if (ensure_unicode(left) < 0)
11476 return NULL;
11477
11478 if (!PyUnicode_Check(right)) {
11479 PyErr_Format(PyExc_TypeError,
11480 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011481 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011482 return NULL;
11483 }
11484 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011485 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486
11487 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011488 if (left == unicode_empty)
11489 return PyUnicode_FromObject(right);
11490 if (right == unicode_empty)
11491 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011493 left_len = PyUnicode_GET_LENGTH(left);
11494 right_len = PyUnicode_GET_LENGTH(right);
11495 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011496 PyErr_SetString(PyExc_OverflowError,
11497 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011498 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011499 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011500 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011501
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011502 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11503 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011504 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011507 result = PyUnicode_New(new_len, maxchar);
11508 if (result == NULL)
11509 return NULL;
11510 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11511 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11512 assert(_PyUnicode_CheckConsistency(result, 1));
11513 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514}
11515
Walter Dörwald1ab83302007-05-18 17:15:44 +000011516void
Victor Stinner23e56682011-10-03 03:54:37 +020011517PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011518{
Victor Stinner23e56682011-10-03 03:54:37 +020011519 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011520 Py_UCS4 maxchar, maxchar2;
11521 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011522
11523 if (p_left == NULL) {
11524 if (!PyErr_Occurred())
11525 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011526 return;
11527 }
Victor Stinner23e56682011-10-03 03:54:37 +020011528 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011529 if (right == NULL || left == NULL
11530 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011531 if (!PyErr_Occurred())
11532 PyErr_BadInternalCall();
11533 goto error;
11534 }
11535
Benjamin Petersonbac79492012-01-14 13:34:47 -050011536 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011537 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011538 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011539 goto error;
11540
Victor Stinner488fa492011-12-12 00:01:39 +010011541 /* Shortcuts */
11542 if (left == unicode_empty) {
11543 Py_DECREF(left);
11544 Py_INCREF(right);
11545 *p_left = right;
11546 return;
11547 }
11548 if (right == unicode_empty)
11549 return;
11550
11551 left_len = PyUnicode_GET_LENGTH(left);
11552 right_len = PyUnicode_GET_LENGTH(right);
11553 if (left_len > PY_SSIZE_T_MAX - right_len) {
11554 PyErr_SetString(PyExc_OverflowError,
11555 "strings are too large to concat");
11556 goto error;
11557 }
11558 new_len = left_len + right_len;
11559
11560 if (unicode_modifiable(left)
11561 && PyUnicode_CheckExact(right)
11562 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011563 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11564 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011565 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011566 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011567 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11568 {
11569 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011570 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011571 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011572
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011573 /* copy 'right' into the newly allocated area of 'left' */
11574 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011575 }
Victor Stinner488fa492011-12-12 00:01:39 +010011576 else {
11577 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11578 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011579 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011580
Victor Stinner488fa492011-12-12 00:01:39 +010011581 /* Concat the two Unicode strings */
11582 res = PyUnicode_New(new_len, maxchar);
11583 if (res == NULL)
11584 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011585 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11586 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011587 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011588 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011589 }
11590 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011591 return;
11592
11593error:
Victor Stinner488fa492011-12-12 00:01:39 +010011594 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011595}
11596
11597void
11598PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11599{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011600 PyUnicode_Append(pleft, right);
11601 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011602}
11603
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011604/*
11605Wraps stringlib_parse_args_finds() and additionally ensures that the
11606first argument is a unicode object.
11607*/
11608
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011609static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011610parse_args_finds_unicode(const char * function_name, PyObject *args,
11611 PyObject **substring,
11612 Py_ssize_t *start, Py_ssize_t *end)
11613{
11614 if(stringlib_parse_args_finds(function_name, args, substring,
11615 start, end)) {
11616 if (ensure_unicode(*substring) < 0)
11617 return 0;
11618 return 1;
11619 }
11620 return 0;
11621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011626Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011627string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011628interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
11630static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011631unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011633 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011634 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011635 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011637 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011638 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011641 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 kind1 = PyUnicode_KIND(self);
11645 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011646 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011647 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 len1 = PyUnicode_GET_LENGTH(self);
11650 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011652 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011653 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011654
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011655 buf1 = PyUnicode_DATA(self);
11656 buf2 = PyUnicode_DATA(substring);
11657 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011658 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011659 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011660 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011661 }
11662 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 case PyUnicode_1BYTE_KIND:
11664 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011665 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 buf2, len2, PY_SSIZE_T_MAX
11667 );
11668 break;
11669 case PyUnicode_2BYTE_KIND:
11670 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011671 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 buf2, len2, PY_SSIZE_T_MAX
11673 );
11674 break;
11675 case PyUnicode_4BYTE_KIND:
11676 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011677 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 buf2, len2, PY_SSIZE_T_MAX
11679 );
11680 break;
11681 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011682 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 }
11684
11685 result = PyLong_FromSsize_t(iresult);
11686
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011687 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011688 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011689 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691 return result;
11692}
11693
INADA Naoki3ae20562017-01-16 20:41:20 +090011694/*[clinic input]
11695str.encode as unicode_encode
11696
11697 encoding: str(c_default="NULL") = 'utf-8'
11698 The encoding in which to encode the string.
11699 errors: str(c_default="NULL") = 'strict'
11700 The error handling scheme to use for encoding errors.
11701 The default is 'strict' meaning that encoding errors raise a
11702 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11703 'xmlcharrefreplace' as well as any other name registered with
11704 codecs.register_error that can handle UnicodeEncodeErrors.
11705
11706Encode the string using the codec registered for encoding.
11707[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708
11709static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011710unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011711/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011713 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011714}
11715
INADA Naoki3ae20562017-01-16 20:41:20 +090011716/*[clinic input]
11717str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718
INADA Naoki3ae20562017-01-16 20:41:20 +090011719 tabsize: int = 8
11720
11721Return a copy where all tab characters are expanded using spaces.
11722
11723If tabsize is not given, a tab size of 8 characters is assumed.
11724[clinic start generated code]*/
11725
11726static PyObject *
11727unicode_expandtabs_impl(PyObject *self, int tabsize)
11728/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011730 Py_ssize_t i, j, line_pos, src_len, incr;
11731 Py_UCS4 ch;
11732 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011733 const void *src_data;
11734 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011735 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011736 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
Antoine Pitrou22425222011-10-04 19:10:51 +020011738 if (PyUnicode_READY(self) == -1)
11739 return NULL;
11740
Thomas Wouters7e474022000-07-16 12:04:32 +000011741 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011742 src_len = PyUnicode_GET_LENGTH(self);
11743 i = j = line_pos = 0;
11744 kind = PyUnicode_KIND(self);
11745 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011746 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011747 for (; i < src_len; i++) {
11748 ch = PyUnicode_READ(kind, src_data, i);
11749 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011750 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011752 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011754 goto overflow;
11755 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011757 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011761 goto overflow;
11762 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011764 if (ch == '\n' || ch == '\r')
11765 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011767 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011768 if (!found)
11769 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011770
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011772 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773 if (!u)
11774 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011775 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
Antoine Pitroue71d5742011-10-04 15:55:09 +020011777 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
Antoine Pitroue71d5742011-10-04 15:55:09 +020011779 for (; i < src_len; i++) {
11780 ch = PyUnicode_READ(kind, src_data, i);
11781 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011783 incr = tabsize - (line_pos % tabsize);
11784 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011785 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011786 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011789 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011790 line_pos++;
11791 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011792 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011793 if (ch == '\n' || ch == '\r')
11794 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011796 }
11797 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011798 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011799
Antoine Pitroue71d5742011-10-04 15:55:09 +020011800 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011801 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11802 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803}
11804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011805PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807\n\
11808Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011809such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810arguments start and end are interpreted as in slice notation.\n\
11811\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011812Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
11814static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011817 /* initialize variables to prevent gcc warning */
11818 PyObject *substring = NULL;
11819 Py_ssize_t start = 0;
11820 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011821 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011823 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011826 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011829 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 if (result == -2)
11832 return NULL;
11833
Christian Heimes217cfd12007-12-02 14:31:20 +000011834 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835}
11836
11837static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011838unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011840 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011841 enum PyUnicode_Kind kind;
11842 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011843
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011844 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011845 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011847 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011848 if (PyUnicode_READY(self) == -1) {
11849 return NULL;
11850 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011851 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11852 PyErr_SetString(PyExc_IndexError, "string index out of range");
11853 return NULL;
11854 }
11855 kind = PyUnicode_KIND(self);
11856 data = PyUnicode_DATA(self);
11857 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011858 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859}
11860
Guido van Rossumc2504932007-09-18 19:42:40 +000011861/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011862 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011863static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011864unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011866 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011867
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011868#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011869 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011870#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 if (_PyUnicode_HASH(self) != -1)
11872 return _PyUnicode_HASH(self);
11873 if (PyUnicode_READY(self) == -1)
11874 return -1;
animalizea1d14252019-01-02 20:16:06 +080011875
Christian Heimes985ecdc2013-11-20 11:46:18 +010011876 x = _Py_HashBytes(PyUnicode_DATA(self),
11877 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011879 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880}
11881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011882PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884\n\
oldkaa0735f2018-02-02 16:52:55 +080011885Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011886such that sub is contained within S[start:end]. Optional\n\
11887arguments start and end are interpreted as in slice notation.\n\
11888\n\
11889Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
11891static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011894 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011895 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011896 PyObject *substring = NULL;
11897 Py_ssize_t start = 0;
11898 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011900 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011903 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011906 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 if (result == -2)
11909 return NULL;
11910
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911 if (result < 0) {
11912 PyErr_SetString(PyExc_ValueError, "substring not found");
11913 return NULL;
11914 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011915
Christian Heimes217cfd12007-12-02 14:31:20 +000011916 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917}
11918
INADA Naoki3ae20562017-01-16 20:41:20 +090011919/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011920str.isascii as unicode_isascii
11921
11922Return True if all characters in the string are ASCII, False otherwise.
11923
11924ASCII characters have code points in the range U+0000-U+007F.
11925Empty string is ASCII too.
11926[clinic start generated code]*/
11927
11928static PyObject *
11929unicode_isascii_impl(PyObject *self)
11930/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11931{
11932 if (PyUnicode_READY(self) == -1) {
11933 return NULL;
11934 }
11935 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11936}
11937
11938/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011939str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940
INADA Naoki3ae20562017-01-16 20:41:20 +090011941Return True if the string is a lowercase string, False otherwise.
11942
11943A string is lowercase if all cased characters in the string are lowercase and
11944there is at least one cased character in the string.
11945[clinic start generated code]*/
11946
11947static PyObject *
11948unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011949/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 Py_ssize_t i, length;
11952 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011953 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954 int cased;
11955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 if (PyUnicode_READY(self) == -1)
11957 return NULL;
11958 length = PyUnicode_GET_LENGTH(self);
11959 kind = PyUnicode_KIND(self);
11960 data = PyUnicode_DATA(self);
11961
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (length == 1)
11964 return PyBool_FromLong(
11965 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011967 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011969 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011970
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 for (i = 0; i < length; i++) {
11973 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011974
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011976 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 else if (!cased && Py_UNICODE_ISLOWER(ch))
11978 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011980 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981}
11982
INADA Naoki3ae20562017-01-16 20:41:20 +090011983/*[clinic input]
11984str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985
INADA Naoki3ae20562017-01-16 20:41:20 +090011986Return True if the string is an uppercase string, False otherwise.
11987
11988A string is uppercase if all cased characters in the string are uppercase and
11989there is at least one cased character in the string.
11990[clinic start generated code]*/
11991
11992static PyObject *
11993unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011994/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 Py_ssize_t i, length;
11997 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011998 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999 int cased;
12000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 if (PyUnicode_READY(self) == -1)
12002 return NULL;
12003 length = PyUnicode_GET_LENGTH(self);
12004 kind = PyUnicode_KIND(self);
12005 data = PyUnicode_DATA(self);
12006
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (length == 1)
12009 return PyBool_FromLong(
12010 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012012 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012014 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012015
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 for (i = 0; i < length; i++) {
12018 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012019
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012021 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 else if (!cased && Py_UNICODE_ISUPPER(ch))
12023 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012025 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026}
12027
INADA Naoki3ae20562017-01-16 20:41:20 +090012028/*[clinic input]
12029str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030
INADA Naoki3ae20562017-01-16 20:41:20 +090012031Return True if the string is a title-cased string, False otherwise.
12032
12033In a title-cased string, upper- and title-case characters may only
12034follow uncased characters and lowercase characters only cased ones.
12035[clinic start generated code]*/
12036
12037static PyObject *
12038unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012039/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 Py_ssize_t i, length;
12042 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012043 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044 int cased, previous_is_cased;
12045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 if (PyUnicode_READY(self) == -1)
12047 return NULL;
12048 length = PyUnicode_GET_LENGTH(self);
12049 kind = PyUnicode_KIND(self);
12050 data = PyUnicode_DATA(self);
12051
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 if (length == 1) {
12054 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12055 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12056 (Py_UNICODE_ISUPPER(ch) != 0));
12057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012059 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012061 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012062
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063 cased = 0;
12064 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 for (i = 0; i < length; i++) {
12066 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012067
Benjamin Peterson29060642009-01-31 22:14:21 +000012068 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12069 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012070 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 previous_is_cased = 1;
12072 cased = 1;
12073 }
12074 else if (Py_UNICODE_ISLOWER(ch)) {
12075 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012076 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012077 previous_is_cased = 1;
12078 cased = 1;
12079 }
12080 else
12081 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012083 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084}
12085
INADA Naoki3ae20562017-01-16 20:41:20 +090012086/*[clinic input]
12087str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
INADA Naoki3ae20562017-01-16 20:41:20 +090012089Return True if the string is a whitespace string, False otherwise.
12090
12091A string is whitespace if all characters in the string are whitespace and there
12092is at least one character in the string.
12093[clinic start generated code]*/
12094
12095static PyObject *
12096unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012097/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 Py_ssize_t i, length;
12100 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012101 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102
12103 if (PyUnicode_READY(self) == -1)
12104 return NULL;
12105 length = PyUnicode_GET_LENGTH(self);
12106 kind = PyUnicode_KIND(self);
12107 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 if (length == 1)
12111 return PyBool_FromLong(
12112 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012114 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012116 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 for (i = 0; i < length; i++) {
12119 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012120 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012121 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012123 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124}
12125
INADA Naoki3ae20562017-01-16 20:41:20 +090012126/*[clinic input]
12127str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012128
INADA Naoki3ae20562017-01-16 20:41:20 +090012129Return True if the string is an alphabetic string, False otherwise.
12130
12131A string is alphabetic if all characters in the string are alphabetic and there
12132is at least one character in the string.
12133[clinic start generated code]*/
12134
12135static PyObject *
12136unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012137/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012138{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 Py_ssize_t i, length;
12140 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012141 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142
12143 if (PyUnicode_READY(self) == -1)
12144 return NULL;
12145 length = PyUnicode_GET_LENGTH(self);
12146 kind = PyUnicode_KIND(self);
12147 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012148
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012149 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 if (length == 1)
12151 return PyBool_FromLong(
12152 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012153
12154 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012156 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 for (i = 0; i < length; i++) {
12159 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012160 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012161 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012162 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012163}
12164
INADA Naoki3ae20562017-01-16 20:41:20 +090012165/*[clinic input]
12166str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012167
INADA Naoki3ae20562017-01-16 20:41:20 +090012168Return True if the string is an alpha-numeric string, False otherwise.
12169
12170A string is alpha-numeric if all characters in the string are alpha-numeric and
12171there is at least one character in the string.
12172[clinic start generated code]*/
12173
12174static PyObject *
12175unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012176/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012179 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 Py_ssize_t len, i;
12181
12182 if (PyUnicode_READY(self) == -1)
12183 return NULL;
12184
12185 kind = PyUnicode_KIND(self);
12186 data = PyUnicode_DATA(self);
12187 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012188
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012189 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if (len == 1) {
12191 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12192 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12193 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012194
12195 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012197 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 for (i = 0; i < len; i++) {
12200 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012201 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012202 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012203 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012204 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012205}
12206
INADA Naoki3ae20562017-01-16 20:41:20 +090012207/*[clinic input]
12208str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209
INADA Naoki3ae20562017-01-16 20:41:20 +090012210Return True if the string is a decimal string, False otherwise.
12211
12212A string is a decimal string if all characters in the string are decimal and
12213there is at least one character in the string.
12214[clinic start generated code]*/
12215
12216static PyObject *
12217unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012218/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 Py_ssize_t i, length;
12221 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012222 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223
12224 if (PyUnicode_READY(self) == -1)
12225 return NULL;
12226 length = PyUnicode_GET_LENGTH(self);
12227 kind = PyUnicode_KIND(self);
12228 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (length == 1)
12232 return PyBool_FromLong(
12233 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012235 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012237 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 for (i = 0; i < length; i++) {
12240 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012241 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012243 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244}
12245
INADA Naoki3ae20562017-01-16 20:41:20 +090012246/*[clinic input]
12247str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248
INADA Naoki3ae20562017-01-16 20:41:20 +090012249Return True if the string is a digit string, False otherwise.
12250
12251A string is a digit string if all characters in the string are digits and there
12252is at least one character in the string.
12253[clinic start generated code]*/
12254
12255static PyObject *
12256unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012257/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 Py_ssize_t i, length;
12260 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012261 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262
12263 if (PyUnicode_READY(self) == -1)
12264 return NULL;
12265 length = PyUnicode_GET_LENGTH(self);
12266 kind = PyUnicode_KIND(self);
12267 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 if (length == 1) {
12271 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12272 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012275 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012277 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 for (i = 0; i < length; i++) {
12280 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012281 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012283 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284}
12285
INADA Naoki3ae20562017-01-16 20:41:20 +090012286/*[clinic input]
12287str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288
INADA Naoki3ae20562017-01-16 20:41:20 +090012289Return True if the string is a numeric string, False otherwise.
12290
12291A string is numeric if all characters in the string are numeric and there is at
12292least one character in the string.
12293[clinic start generated code]*/
12294
12295static PyObject *
12296unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012297/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 Py_ssize_t i, length;
12300 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012301 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302
12303 if (PyUnicode_READY(self) == -1)
12304 return NULL;
12305 length = PyUnicode_GET_LENGTH(self);
12306 kind = PyUnicode_KIND(self);
12307 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (length == 1)
12311 return PyBool_FromLong(
12312 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012314 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012316 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 for (i = 0; i < length; i++) {
12319 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012320 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012322 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323}
12324
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012325Py_ssize_t
12326_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012327{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012329 if (PyUnicode_READY(self) == -1)
12330 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012331
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012332 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012333 if (len == 0) {
12334 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 }
12337
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012338 int kind = PyUnicode_KIND(self);
12339 const void *data = PyUnicode_DATA(self);
12340 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012341 /* PEP 3131 says that the first character must be in
12342 XID_Start and subsequent characters in XID_Continue,
12343 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012344 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012345 letters, digits, underscore). However, given the current
12346 definition of XID_Start and XID_Continue, it is sufficient
12347 to check just for these, except that _ must be allowed
12348 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012349 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012350 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012351 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012352
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012353 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012354 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012355 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012356 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012357 }
12358 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012359 return i;
12360}
12361
12362int
12363PyUnicode_IsIdentifier(PyObject *self)
12364{
12365 if (PyUnicode_IS_READY(self)) {
12366 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12367 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12368 /* an empty string is not a valid identifier */
12369 return len && i == len;
12370 }
12371 else {
Inada Naoki610a60c2020-06-18 17:30:53 +090012372_Py_COMP_DIAG_PUSH
12373_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012374 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012375 if (len == 0) {
12376 /* an empty string is not a valid identifier */
12377 return 0;
12378 }
12379
12380 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012381 Py_UCS4 ch = wstr[i++];
12382#if SIZEOF_WCHAR_T == 2
12383 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12384 && i < len
12385 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12386 {
12387 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12388 i++;
12389 }
12390#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012391 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12392 return 0;
12393 }
12394
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012395 while (i < len) {
12396 ch = wstr[i++];
12397#if SIZEOF_WCHAR_T == 2
12398 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12399 && i < len
12400 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12401 {
12402 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12403 i++;
12404 }
12405#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012406 if (!_PyUnicode_IsXidContinue(ch)) {
12407 return 0;
12408 }
12409 }
12410 return 1;
Inada Naoki610a60c2020-06-18 17:30:53 +090012411_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012412 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012413}
12414
INADA Naoki3ae20562017-01-16 20:41:20 +090012415/*[clinic input]
12416str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012417
INADA Naoki3ae20562017-01-16 20:41:20 +090012418Return True if the string is a valid Python identifier, False otherwise.
12419
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012420Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012421such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012422[clinic start generated code]*/
12423
12424static PyObject *
12425unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012426/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012427{
12428 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12429}
12430
INADA Naoki3ae20562017-01-16 20:41:20 +090012431/*[clinic input]
12432str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012433
INADA Naoki3ae20562017-01-16 20:41:20 +090012434Return True if the string is printable, False otherwise.
12435
12436A string is printable if all of its characters are considered printable in
12437repr() or if it is empty.
12438[clinic start generated code]*/
12439
12440static PyObject *
12441unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012442/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012443{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 Py_ssize_t i, length;
12445 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012446 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447
12448 if (PyUnicode_READY(self) == -1)
12449 return NULL;
12450 length = PyUnicode_GET_LENGTH(self);
12451 kind = PyUnicode_KIND(self);
12452 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012453
12454 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 if (length == 1)
12456 return PyBool_FromLong(
12457 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 for (i = 0; i < length; i++) {
12460 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012461 Py_RETURN_FALSE;
12462 }
12463 }
12464 Py_RETURN_TRUE;
12465}
12466
INADA Naoki3ae20562017-01-16 20:41:20 +090012467/*[clinic input]
12468str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469
INADA Naoki3ae20562017-01-16 20:41:20 +090012470 iterable: object
12471 /
12472
12473Concatenate any number of strings.
12474
Martin Panter91a88662017-01-24 00:30:06 +000012475The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012476The result is returned as a new string.
12477
12478Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12479[clinic start generated code]*/
12480
12481static PyObject *
12482unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012483/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484{
INADA Naoki3ae20562017-01-16 20:41:20 +090012485 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486}
12487
Martin v. Löwis18e16552006-02-15 17:27:45 +000012488static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012489unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491 if (PyUnicode_READY(self) == -1)
12492 return -1;
12493 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494}
12495
INADA Naoki3ae20562017-01-16 20:41:20 +090012496/*[clinic input]
12497str.ljust as unicode_ljust
12498
12499 width: Py_ssize_t
12500 fillchar: Py_UCS4 = ' '
12501 /
12502
12503Return a left-justified string of length width.
12504
12505Padding is done using the specified fill character (default is a space).
12506[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507
12508static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012509unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12510/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012512 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514
Victor Stinnerc4b49542011-12-11 22:44:26 +010012515 if (PyUnicode_GET_LENGTH(self) >= width)
12516 return unicode_result_unchanged(self);
12517
12518 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519}
12520
INADA Naoki3ae20562017-01-16 20:41:20 +090012521/*[clinic input]
12522str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523
INADA Naoki3ae20562017-01-16 20:41:20 +090012524Return a copy of the string converted to lowercase.
12525[clinic start generated code]*/
12526
12527static PyObject *
12528unicode_lower_impl(PyObject *self)
12529/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012531 if (PyUnicode_READY(self) == -1)
12532 return NULL;
12533 if (PyUnicode_IS_ASCII(self))
12534 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012535 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536}
12537
/* Selectors for which end(s) of a string a strip operation trims.  The
   values double as indices into stripfuncnames[] below, so the two must be
   kept in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above (used in error messages).
   The table is never modified, so both the strings and the pointers to them
   are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip selector to the name of the corresponding str method. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012546
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012547/* externally visible for str.strip(unicode) */
12548PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012549_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012550{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012551 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 int kind;
12553 Py_ssize_t i, j, len;
12554 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012555 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12558 return NULL;
12559
12560 kind = PyUnicode_KIND(self);
12561 data = PyUnicode_DATA(self);
12562 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012563 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12565 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012566 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012567
Benjamin Peterson14339b62009-01-31 16:36:08 +000012568 i = 0;
12569 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012570 while (i < len) {
12571 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12572 if (!BLOOM(sepmask, ch))
12573 break;
12574 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12575 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 i++;
12577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012578 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012579
Benjamin Peterson14339b62009-01-31 16:36:08 +000012580 j = len;
12581 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012582 j--;
12583 while (j >= i) {
12584 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12585 if (!BLOOM(sepmask, ch))
12586 break;
12587 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12588 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012589 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012590 }
12591
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012593 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012594
Victor Stinner7931d9a2011-11-04 00:22:48 +010012595 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596}
12597
12598PyObject*
12599PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12600{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012601 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012603 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604
Victor Stinnerde636f32011-10-01 03:55:54 +020012605 if (PyUnicode_READY(self) == -1)
12606 return NULL;
12607
Victor Stinner684d5fd2012-05-03 02:32:34 +020012608 length = PyUnicode_GET_LENGTH(self);
12609 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012610
Victor Stinner684d5fd2012-05-03 02:32:34 +020012611 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012612 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613
Victor Stinnerde636f32011-10-01 03:55:54 +020012614 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012615 PyErr_SetString(PyExc_IndexError, "string index out of range");
12616 return NULL;
12617 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012618 if (start >= length || end < start)
12619 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012620
Victor Stinner684d5fd2012-05-03 02:32:34 +020012621 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012622 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012623 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012624 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012625 }
12626 else {
12627 kind = PyUnicode_KIND(self);
12628 data = PyUnicode_1BYTE_DATA(self);
12629 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012630 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012631 length);
12632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634
12635static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012636do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 Py_ssize_t len, i, j;
12639
12640 if (PyUnicode_READY(self) == -1)
12641 return NULL;
12642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012644
Victor Stinnercc7af722013-04-09 22:39:24 +020012645 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012646 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012647
12648 i = 0;
12649 if (striptype != RIGHTSTRIP) {
12650 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012651 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012652 if (!_Py_ascii_whitespace[ch])
12653 break;
12654 i++;
12655 }
12656 }
12657
12658 j = len;
12659 if (striptype != LEFTSTRIP) {
12660 j--;
12661 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012662 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012663 if (!_Py_ascii_whitespace[ch])
12664 break;
12665 j--;
12666 }
12667 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012668 }
12669 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012670 else {
12671 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012672 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012673
Victor Stinnercc7af722013-04-09 22:39:24 +020012674 i = 0;
12675 if (striptype != RIGHTSTRIP) {
12676 while (i < len) {
12677 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12678 if (!Py_UNICODE_ISSPACE(ch))
12679 break;
12680 i++;
12681 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012682 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012683
12684 j = len;
12685 if (striptype != LEFTSTRIP) {
12686 j--;
12687 while (j >= i) {
12688 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12689 if (!Py_UNICODE_ISSPACE(ch))
12690 break;
12691 j--;
12692 }
12693 j++;
12694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012695 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012696
Victor Stinner7931d9a2011-11-04 00:22:48 +010012697 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698}
12699
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012700
12701static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012702do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012703{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012704 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012705 if (PyUnicode_Check(sep))
12706 return _PyUnicode_XStrip(self, striptype, sep);
12707 else {
12708 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012709 "%s arg must be None or str",
12710 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012711 return NULL;
12712 }
12713 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012714
Benjamin Peterson14339b62009-01-31 16:36:08 +000012715 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012716}
12717
12718
INADA Naoki3ae20562017-01-16 20:41:20 +090012719/*[clinic input]
12720str.strip as unicode_strip
12721
12722 chars: object = None
12723 /
12724
Zachary Ware09895c22019-10-09 16:09:00 -050012725Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012726
12727If chars is given and not None, remove characters in chars instead.
12728[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012729
12730static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012731unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012732/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012733{
INADA Naoki3ae20562017-01-16 20:41:20 +090012734 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012735}
12736
12737
INADA Naoki3ae20562017-01-16 20:41:20 +090012738/*[clinic input]
12739str.lstrip as unicode_lstrip
12740
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012741 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012742 /
12743
12744Return a copy of the string with leading whitespace removed.
12745
12746If chars is given and not None, remove characters in chars instead.
12747[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012748
12749static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012750unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012751/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012752{
INADA Naoki3ae20562017-01-16 20:41:20 +090012753 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012754}
12755
12756
INADA Naoki3ae20562017-01-16 20:41:20 +090012757/*[clinic input]
12758str.rstrip as unicode_rstrip
12759
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012760 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012761 /
12762
12763Return a copy of the string with trailing whitespace removed.
12764
12765If chars is given and not None, remove characters in chars instead.
12766[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012767
12768static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012769unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012770/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012771{
INADA Naoki3ae20562017-01-16 20:41:20 +090012772 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012773}
12774
12775
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012777unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012779 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781
Serhiy Storchaka05997252013-01-26 12:14:02 +020012782 if (len < 1)
12783 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784
Victor Stinnerc4b49542011-12-11 22:44:26 +010012785 /* no repeat, return original string */
12786 if (len == 1)
12787 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012788
Benjamin Petersonbac79492012-01-14 13:34:47 -050012789 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790 return NULL;
12791
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012792 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012793 PyErr_SetString(PyExc_OverflowError,
12794 "repeated string is too long");
12795 return NULL;
12796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012798
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012799 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800 if (!u)
12801 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012802 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012805 int kind = PyUnicode_KIND(str);
12806 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012807 if (kind == PyUnicode_1BYTE_KIND) {
12808 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012809 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012810 }
12811 else if (kind == PyUnicode_2BYTE_KIND) {
12812 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012813 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012814 ucs2[n] = fill_char;
12815 } else {
12816 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12817 assert(kind == PyUnicode_4BYTE_KIND);
12818 for (n = 0; n < len; ++n)
12819 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 }
12822 else {
12823 /* number of characters copied this far */
12824 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012825 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012827 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012831 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012832 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834 }
12835
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012836 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012837 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838}
12839
Alexander Belopolsky40018472011-02-26 01:02:56 +000012840PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012841PyUnicode_Replace(PyObject *str,
12842 PyObject *substr,
12843 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012844 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012846 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12847 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012848 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012849 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850}
12851
INADA Naoki3ae20562017-01-16 20:41:20 +090012852/*[clinic input]
12853str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854
INADA Naoki3ae20562017-01-16 20:41:20 +090012855 old: unicode
12856 new: unicode
12857 count: Py_ssize_t = -1
12858 Maximum number of occurrences to replace.
12859 -1 (the default value) means replace all occurrences.
12860 /
12861
12862Return a copy with all occurrences of substring old replaced by new.
12863
12864If the optional argument count is given, only the first count occurrences are
12865replaced.
12866[clinic start generated code]*/
12867
12868static PyObject *
12869unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12870 Py_ssize_t count)
12871/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012873 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012875 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876}
12877
sweeneydea81849b2020-04-22 17:05:48 -040012878/*[clinic input]
12879str.removeprefix as unicode_removeprefix
12880
12881 prefix: unicode
12882 /
12883
12884Return a str with the given prefix string removed if present.
12885
12886If the string starts with the prefix string, return string[len(prefix):].
12887Otherwise, return a copy of the original string.
12888[clinic start generated code]*/
12889
12890static PyObject *
12891unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12892/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12893{
12894 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12895 if (match == -1) {
12896 return NULL;
12897 }
12898 if (match) {
12899 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12900 PyUnicode_GET_LENGTH(self));
12901 }
12902 return unicode_result_unchanged(self);
12903}
12904
12905/*[clinic input]
12906str.removesuffix as unicode_removesuffix
12907
12908 suffix: unicode
12909 /
12910
12911Return a str with the given suffix string removed if present.
12912
12913If the string ends with the suffix string and that suffix is not empty,
12914return string[:-len(suffix)]. Otherwise, return a copy of the original
12915string.
12916[clinic start generated code]*/
12917
12918static PyObject *
12919unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12920/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12921{
12922 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12923 if (match == -1) {
12924 return NULL;
12925 }
12926 if (match) {
12927 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12928 - PyUnicode_GET_LENGTH(suffix));
12929 }
12930 return unicode_result_unchanged(self);
12931}
12932
Alexander Belopolsky40018472011-02-26 01:02:56 +000012933static PyObject *
12934unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012936 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 Py_ssize_t isize;
12938 Py_ssize_t osize, squote, dquote, i, o;
12939 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012940 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012941 const void *idata;
12942 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012945 return NULL;
12946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 isize = PyUnicode_GET_LENGTH(unicode);
12948 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 /* Compute length of output, quote characters, and
12951 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012952 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 max = 127;
12954 squote = dquote = 0;
12955 ikind = PyUnicode_KIND(unicode);
12956 for (i = 0; i < isize; i++) {
12957 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012958 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012960 case '\'': squote++; break;
12961 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012963 incr = 2;
12964 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 default:
12966 /* Fast-path ASCII */
12967 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012968 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012970 ;
12971 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012974 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012976 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012978 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012980 if (osize > PY_SSIZE_T_MAX - incr) {
12981 PyErr_SetString(PyExc_OverflowError,
12982 "string is too long to generate repr");
12983 return NULL;
12984 }
12985 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 }
12987
12988 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012989 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012991 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 if (dquote)
12993 /* Both squote and dquote present. Use squote,
12994 and escape them */
12995 osize += squote;
12996 else
12997 quote = '"';
12998 }
Victor Stinner55c08782013-04-14 18:45:39 +020012999 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000
13001 repr = PyUnicode_New(osize, max);
13002 if (repr == NULL)
13003 return NULL;
13004 okind = PyUnicode_KIND(repr);
13005 odata = PyUnicode_DATA(repr);
13006
13007 PyUnicode_WRITE(okind, odata, 0, quote);
13008 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013009 if (unchanged) {
13010 _PyUnicode_FastCopyCharacters(repr, 1,
13011 unicode, 0,
13012 isize);
13013 }
13014 else {
13015 for (i = 0, o = 1; i < isize; i++) {
13016 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017
Victor Stinner55c08782013-04-14 18:45:39 +020013018 /* Escape quotes and backslashes */
13019 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013020 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013022 continue;
13023 }
13024
13025 /* Map special whitespace to '\t', \n', '\r' */
13026 if (ch == '\t') {
13027 PyUnicode_WRITE(okind, odata, o++, '\\');
13028 PyUnicode_WRITE(okind, odata, o++, 't');
13029 }
13030 else if (ch == '\n') {
13031 PyUnicode_WRITE(okind, odata, o++, '\\');
13032 PyUnicode_WRITE(okind, odata, o++, 'n');
13033 }
13034 else if (ch == '\r') {
13035 PyUnicode_WRITE(okind, odata, o++, '\\');
13036 PyUnicode_WRITE(okind, odata, o++, 'r');
13037 }
13038
13039 /* Map non-printable US ASCII to '\xhh' */
13040 else if (ch < ' ' || ch == 0x7F) {
13041 PyUnicode_WRITE(okind, odata, o++, '\\');
13042 PyUnicode_WRITE(okind, odata, o++, 'x');
13043 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13044 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13045 }
13046
13047 /* Copy ASCII characters as-is */
13048 else if (ch < 0x7F) {
13049 PyUnicode_WRITE(okind, odata, o++, ch);
13050 }
13051
13052 /* Non-ASCII characters */
13053 else {
13054 /* Map Unicode whitespace and control characters
13055 (categories Z* and C* except ASCII space)
13056 */
13057 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13058 PyUnicode_WRITE(okind, odata, o++, '\\');
13059 /* Map 8-bit characters to '\xhh' */
13060 if (ch <= 0xff) {
13061 PyUnicode_WRITE(okind, odata, o++, 'x');
13062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13064 }
13065 /* Map 16-bit characters to '\uxxxx' */
13066 else if (ch <= 0xffff) {
13067 PyUnicode_WRITE(okind, odata, o++, 'u');
13068 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13070 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13071 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13072 }
13073 /* Map 21-bit characters to '\U00xxxxxx' */
13074 else {
13075 PyUnicode_WRITE(okind, odata, o++, 'U');
13076 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13077 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13078 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13079 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13080 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13081 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13082 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13084 }
13085 }
13086 /* Copy characters as-is */
13087 else {
13088 PyUnicode_WRITE(okind, odata, o++, ch);
13089 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013090 }
13091 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013093 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013094 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013095 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013096}
13097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013098PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013099 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100\n\
13101Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013102such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103arguments start and end are interpreted as in slice notation.\n\
13104\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013105Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106
13107static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013110 /* initialize variables to prevent gcc warning */
13111 PyObject *substring = NULL;
13112 Py_ssize_t start = 0;
13113 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013114 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013116 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013117 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013119 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013122 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 if (result == -2)
13125 return NULL;
13126
Christian Heimes217cfd12007-12-02 14:31:20 +000013127 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128}
13129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013130PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013133Return the highest index in S where substring sub is found,\n\
13134such that sub is contained within S[start:end]. Optional\n\
13135arguments start and end are interpreted as in slice notation.\n\
13136\n\
13137Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138
13139static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013142 /* initialize variables to prevent gcc warning */
13143 PyObject *substring = NULL;
13144 Py_ssize_t start = 0;
13145 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013146 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013148 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013149 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013151 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013154 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 if (result == -2)
13157 return NULL;
13158
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159 if (result < 0) {
13160 PyErr_SetString(PyExc_ValueError, "substring not found");
13161 return NULL;
13162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163
Christian Heimes217cfd12007-12-02 14:31:20 +000013164 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165}
13166
INADA Naoki3ae20562017-01-16 20:41:20 +090013167/*[clinic input]
13168str.rjust as unicode_rjust
13169
13170 width: Py_ssize_t
13171 fillchar: Py_UCS4 = ' '
13172 /
13173
13174Return a right-justified string of length width.
13175
13176Padding is done using the specified fill character (default is a space).
13177[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013178
13179static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013180unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13181/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013183 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184 return NULL;
13185
Victor Stinnerc4b49542011-12-11 22:44:26 +010013186 if (PyUnicode_GET_LENGTH(self) >= width)
13187 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188
Victor Stinnerc4b49542011-12-11 22:44:26 +010013189 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190}
13191
Alexander Belopolsky40018472011-02-26 01:02:56 +000013192PyObject *
13193PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013195 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013198 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199}
13200
INADA Naoki3ae20562017-01-16 20:41:20 +090013201/*[clinic input]
13202str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203
INADA Naoki3ae20562017-01-16 20:41:20 +090013204 sep: object = None
13205 The delimiter according which to split the string.
13206 None (the default value) means split according to any whitespace,
13207 and discard empty strings from the result.
13208 maxsplit: Py_ssize_t = -1
13209 Maximum number of splits to do.
13210 -1 (the default value) means no limit.
13211
13212Return a list of the words in the string, using sep as the delimiter string.
13213[clinic start generated code]*/
13214
13215static PyObject *
13216unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13217/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218{
INADA Naoki3ae20562017-01-16 20:41:20 +090013219 if (sep == Py_None)
13220 return split(self, NULL, maxsplit);
13221 if (PyUnicode_Check(sep))
13222 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013223
Victor Stinner998b8062018-09-12 00:23:25 +020013224 PyErr_Format(PyExc_TypeError,
13225 "must be str or None, not %.100s",
13226 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228}
13229
Thomas Wouters477c8d52006-05-27 19:21:47 +000013230PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013231PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013232{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013233 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013234 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013235 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013237
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013238 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013239 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013240
Victor Stinner14f8f022011-10-05 20:58:25 +020013241 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013242 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013243 len1 = PyUnicode_GET_LENGTH(str_obj);
13244 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013245 if (kind1 < kind2 || len1 < len2) {
13246 _Py_INCREF_UNICODE_EMPTY();
13247 if (!unicode_empty)
13248 out = NULL;
13249 else {
13250 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13251 Py_DECREF(unicode_empty);
13252 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013253 return out;
13254 }
13255 buf1 = PyUnicode_DATA(str_obj);
13256 buf2 = PyUnicode_DATA(sep_obj);
13257 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013258 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013259 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013260 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013261 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013262
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013263 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013265 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13266 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13267 else
13268 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269 break;
13270 case PyUnicode_2BYTE_KIND:
13271 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13272 break;
13273 case PyUnicode_4BYTE_KIND:
13274 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13275 break;
13276 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013277 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013278 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013279
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013280 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013281 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013282 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013283
13284 return out;
13285}
13286
13287
13288PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013289PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013290{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013291 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013292 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013293 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013294 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013295
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013296 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013297 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013298
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013299 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013300 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 len1 = PyUnicode_GET_LENGTH(str_obj);
13302 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013303 if (kind1 < kind2 || len1 < len2) {
13304 _Py_INCREF_UNICODE_EMPTY();
13305 if (!unicode_empty)
13306 out = NULL;
13307 else {
13308 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13309 Py_DECREF(unicode_empty);
13310 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013311 return out;
13312 }
13313 buf1 = PyUnicode_DATA(str_obj);
13314 buf2 = PyUnicode_DATA(sep_obj);
13315 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013316 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013317 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013318 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013321 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013323 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13324 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13325 else
13326 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 break;
13328 case PyUnicode_2BYTE_KIND:
13329 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13330 break;
13331 case PyUnicode_4BYTE_KIND:
13332 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13333 break;
13334 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013335 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013336 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013337
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013338 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013339 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013340 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013341
13342 return out;
13343}
13344
INADA Naoki3ae20562017-01-16 20:41:20 +090013345/*[clinic input]
13346str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013347
INADA Naoki3ae20562017-01-16 20:41:20 +090013348 sep: object
13349 /
13350
13351Partition the string into three parts using the given separator.
13352
13353This will search for the separator in the string. If the separator is found,
13354returns a 3-tuple containing the part before the separator, the separator
13355itself, and the part after it.
13356
13357If the separator is not found, returns a 3-tuple containing the original string
13358and two empty strings.
13359[clinic start generated code]*/
13360
13361static PyObject *
13362unicode_partition(PyObject *self, PyObject *sep)
13363/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013364{
INADA Naoki3ae20562017-01-16 20:41:20 +090013365 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013366}
13367
INADA Naoki3ae20562017-01-16 20:41:20 +090013368/*[clinic input]
13369str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013370
INADA Naoki3ae20562017-01-16 20:41:20 +090013371Partition the string into three parts using the given separator.
13372
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013373This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013374the separator is found, returns a 3-tuple containing the part before the
13375separator, the separator itself, and the part after it.
13376
13377If the separator is not found, returns a 3-tuple containing two empty strings
13378and the original string.
13379[clinic start generated code]*/
13380
13381static PyObject *
13382unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013383/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013384{
INADA Naoki3ae20562017-01-16 20:41:20 +090013385 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013386}
13387
Alexander Belopolsky40018472011-02-26 01:02:56 +000013388PyObject *
13389PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013390{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013391 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013392 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013393
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013394 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013395}
13396
INADA Naoki3ae20562017-01-16 20:41:20 +090013397/*[clinic input]
13398str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013399
INADA Naoki3ae20562017-01-16 20:41:20 +090013400Return a list of the words in the string, using sep as the delimiter string.
13401
13402Splits are done starting at the end of the string and working to the front.
13403[clinic start generated code]*/
13404
13405static PyObject *
13406unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13407/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013408{
INADA Naoki3ae20562017-01-16 20:41:20 +090013409 if (sep == Py_None)
13410 return rsplit(self, NULL, maxsplit);
13411 if (PyUnicode_Check(sep))
13412 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013413
Victor Stinner998b8062018-09-12 00:23:25 +020013414 PyErr_Format(PyExc_TypeError,
13415 "must be str or None, not %.100s",
13416 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013417 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013418}
13419
INADA Naoki3ae20562017-01-16 20:41:20 +090013420/*[clinic input]
13421str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013423 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013424
13425Return a list of the lines in the string, breaking at line boundaries.
13426
13427Line breaks are not included in the resulting list unless keepends is given and
13428true.
13429[clinic start generated code]*/
13430
13431static PyObject *
13432unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013433/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013435 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436}
13437
13438static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013439PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013440{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013441 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442}
13443
INADA Naoki3ae20562017-01-16 20:41:20 +090013444/*[clinic input]
13445str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446
INADA Naoki3ae20562017-01-16 20:41:20 +090013447Convert uppercase characters to lowercase and lowercase characters to uppercase.
13448[clinic start generated code]*/
13449
13450static PyObject *
13451unicode_swapcase_impl(PyObject *self)
13452/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013453{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013454 if (PyUnicode_READY(self) == -1)
13455 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013456 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457}
13458
Larry Hastings61272b72014-01-07 12:41:53 -080013459/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013460
Larry Hastings31826802013-10-19 00:09:25 -070013461@staticmethod
13462str.maketrans as unicode_maketrans
13463
13464 x: object
13465
13466 y: unicode=NULL
13467
13468 z: unicode=NULL
13469
13470 /
13471
13472Return a translation table usable for str.translate().
13473
13474If there is only one argument, it must be a dictionary mapping Unicode
13475ordinals (integers) or characters to Unicode ordinals, strings or None.
13476Character keys will be then converted to ordinals.
13477If there are two arguments, they must be strings of equal length, and
13478in the resulting dictionary, each character in x will be mapped to the
13479character at the same position in y. If there is a third argument, it
13480must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013481[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013482
Larry Hastings31826802013-10-19 00:09:25 -070013483static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013484unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013485/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013486{
Georg Brandlceee0772007-11-27 23:48:05 +000013487 PyObject *new = NULL, *key, *value;
13488 Py_ssize_t i = 0;
13489 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013490
Georg Brandlceee0772007-11-27 23:48:05 +000013491 new = PyDict_New();
13492 if (!new)
13493 return NULL;
13494 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013495 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013496 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013497
Georg Brandlceee0772007-11-27 23:48:05 +000013498 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013499 if (!PyUnicode_Check(x)) {
13500 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13501 "be a string if there is a second argument");
13502 goto err;
13503 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013504 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013505 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13506 "arguments must have equal length");
13507 goto err;
13508 }
13509 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013510 x_kind = PyUnicode_KIND(x);
13511 y_kind = PyUnicode_KIND(y);
13512 x_data = PyUnicode_DATA(x);
13513 y_data = PyUnicode_DATA(y);
13514 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13515 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013516 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013517 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013518 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013519 if (!value) {
13520 Py_DECREF(key);
13521 goto err;
13522 }
Georg Brandlceee0772007-11-27 23:48:05 +000013523 res = PyDict_SetItem(new, key, value);
13524 Py_DECREF(key);
13525 Py_DECREF(value);
13526 if (res < 0)
13527 goto err;
13528 }
13529 /* create entries for deleting chars in z */
13530 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013531 z_kind = PyUnicode_KIND(z);
13532 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013533 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013534 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013535 if (!key)
13536 goto err;
13537 res = PyDict_SetItem(new, key, Py_None);
13538 Py_DECREF(key);
13539 if (res < 0)
13540 goto err;
13541 }
13542 }
13543 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013544 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013545 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013546
Georg Brandlceee0772007-11-27 23:48:05 +000013547 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013548 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013549 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13550 "to maketrans it must be a dict");
13551 goto err;
13552 }
13553 /* copy entries into the new dict, converting string keys to int keys */
13554 while (PyDict_Next(x, &i, &key, &value)) {
13555 if (PyUnicode_Check(key)) {
13556 /* convert string keys to integer keys */
13557 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013558 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013559 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13560 "table must be of length 1");
13561 goto err;
13562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563 kind = PyUnicode_KIND(key);
13564 data = PyUnicode_DATA(key);
13565 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013566 if (!newkey)
13567 goto err;
13568 res = PyDict_SetItem(new, newkey, value);
13569 Py_DECREF(newkey);
13570 if (res < 0)
13571 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013572 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013573 /* just keep integer keys */
13574 if (PyDict_SetItem(new, key, value) < 0)
13575 goto err;
13576 } else {
13577 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13578 "be strings or integers");
13579 goto err;
13580 }
13581 }
13582 }
13583 return new;
13584 err:
13585 Py_DECREF(new);
13586 return NULL;
13587}
13588
INADA Naoki3ae20562017-01-16 20:41:20 +090013589/*[clinic input]
13590str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013591
INADA Naoki3ae20562017-01-16 20:41:20 +090013592 table: object
13593 Translation table, which must be a mapping of Unicode ordinals to
13594 Unicode ordinals, strings, or None.
13595 /
13596
13597Replace each character in the string using the given translation table.
13598
13599The table must implement lookup/indexing via __getitem__, for instance a
13600dictionary or list. If this operation raises LookupError, the character is
13601left untouched. Characters mapped to None are deleted.
13602[clinic start generated code]*/
13603
13604static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013605unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013606/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013608 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013609}
13610
INADA Naoki3ae20562017-01-16 20:41:20 +090013611/*[clinic input]
13612str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013613
INADA Naoki3ae20562017-01-16 20:41:20 +090013614Return a copy of the string converted to uppercase.
13615[clinic start generated code]*/
13616
13617static PyObject *
13618unicode_upper_impl(PyObject *self)
13619/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013620{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013621 if (PyUnicode_READY(self) == -1)
13622 return NULL;
13623 if (PyUnicode_IS_ASCII(self))
13624 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013625 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013626}
13627
INADA Naoki3ae20562017-01-16 20:41:20 +090013628/*[clinic input]
13629str.zfill as unicode_zfill
13630
13631 width: Py_ssize_t
13632 /
13633
13634Pad a numeric string with zeros on the left, to fill a field of the given width.
13635
13636The string is never truncated.
13637[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013638
13639static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013640unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013641/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013642{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013643 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013644 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013645 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013646 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013647 Py_UCS4 chr;
13648
Benjamin Petersonbac79492012-01-14 13:34:47 -050013649 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013651
Victor Stinnerc4b49542011-12-11 22:44:26 +010013652 if (PyUnicode_GET_LENGTH(self) >= width)
13653 return unicode_result_unchanged(self);
13654
13655 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013656
13657 u = pad(self, fill, 0, '0');
13658
Walter Dörwald068325e2002-04-15 13:36:47 +000013659 if (u == NULL)
13660 return NULL;
13661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013662 kind = PyUnicode_KIND(u);
13663 data = PyUnicode_DATA(u);
13664 chr = PyUnicode_READ(kind, data, fill);
13665
13666 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013667 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013668 PyUnicode_WRITE(kind, data, 0, chr);
13669 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013670 }
13671
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013672 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013673 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013674}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013675
#if 0
/* Compiled out by the surrounding #if 0.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013684PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013685 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013686\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013687Return True if S starts with the specified prefix, False otherwise.\n\
13688With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013689With optional end, stop comparing S at that position.\n\
13690prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013691
13692static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013693unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013694 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013695{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013696 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013697 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013698 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013699 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013700 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013701
Jesus Ceaac451502011-04-20 17:09:23 +020013702 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013703 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013704 if (PyTuple_Check(subobj)) {
13705 Py_ssize_t i;
13706 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013707 substring = PyTuple_GET_ITEM(subobj, i);
13708 if (!PyUnicode_Check(substring)) {
13709 PyErr_Format(PyExc_TypeError,
13710 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013711 "not %.100s",
13712 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013713 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013714 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013715 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013716 if (result == -1)
13717 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013718 if (result) {
13719 Py_RETURN_TRUE;
13720 }
13721 }
13722 /* nothing matched */
13723 Py_RETURN_FALSE;
13724 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013725 if (!PyUnicode_Check(subobj)) {
13726 PyErr_Format(PyExc_TypeError,
13727 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013728 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013730 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013731 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013732 if (result == -1)
13733 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013734 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013735}
13736
13737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013738PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013739 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013740\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013741Return True if S ends with the specified suffix, False otherwise.\n\
13742With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013743With optional end, stop comparing S at that position.\n\
13744suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013745
13746static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013747unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013748 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013749{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013750 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013751 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013752 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013753 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013754 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013755
Jesus Ceaac451502011-04-20 17:09:23 +020013756 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013757 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013758 if (PyTuple_Check(subobj)) {
13759 Py_ssize_t i;
13760 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013761 substring = PyTuple_GET_ITEM(subobj, i);
13762 if (!PyUnicode_Check(substring)) {
13763 PyErr_Format(PyExc_TypeError,
13764 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013765 "not %.100s",
13766 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013767 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013768 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013769 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013770 if (result == -1)
13771 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013772 if (result) {
13773 Py_RETURN_TRUE;
13774 }
13775 }
13776 Py_RETURN_FALSE;
13777 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013778 if (!PyUnicode_Check(subobj)) {
13779 PyErr_Format(PyExc_TypeError,
13780 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013781 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013782 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013783 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013784 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013785 if (result == -1)
13786 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013787 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013788}
13789
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013790static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013791_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013792{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013793 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13794 writer->data = PyUnicode_DATA(writer->buffer);
13795
13796 if (!writer->readonly) {
13797 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013798 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013799 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013800 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013801 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13802 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13803 writer->kind = PyUnicode_WCHAR_KIND;
13804 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13805
Victor Stinner8f674cc2013-04-17 23:02:17 +020013806 /* Copy-on-write mode: set buffer size to 0 so
13807 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13808 * next write. */
13809 writer->size = 0;
13810 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013811}
13812
Victor Stinnerd3f08822012-05-29 12:57:52 +020013813void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013814_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013815{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013816 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013817
13818 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013819 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013820
13821 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13822 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13823 writer->kind = PyUnicode_WCHAR_KIND;
13824 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013825}
13826
Inada Naoki770847a2019-06-24 12:30:24 +090013827// Initialize _PyUnicodeWriter with initial buffer
13828static inline void
13829_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13830{
13831 memset(writer, 0, sizeof(*writer));
13832 writer->buffer = buffer;
13833 _PyUnicodeWriter_Update(writer);
13834 writer->min_length = writer->size;
13835}
13836
Victor Stinnerd3f08822012-05-29 12:57:52 +020013837int
13838_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13839 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013840{
13841 Py_ssize_t newlen;
13842 PyObject *newbuffer;
13843
Victor Stinner2740e462016-09-06 16:58:36 -070013844 assert(maxchar <= MAX_UNICODE);
13845
Victor Stinnerca9381e2015-09-22 00:58:32 +020013846 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013847 assert((maxchar > writer->maxchar && length >= 0)
13848 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013849
Victor Stinner202fdca2012-05-07 12:47:02 +020013850 if (length > PY_SSIZE_T_MAX - writer->pos) {
13851 PyErr_NoMemory();
13852 return -1;
13853 }
13854 newlen = writer->pos + length;
13855
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013856 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013857
Victor Stinnerd3f08822012-05-29 12:57:52 +020013858 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013859 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013860 if (writer->overallocate
13861 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13862 /* overallocate to limit the number of realloc() */
13863 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013864 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013865 if (newlen < writer->min_length)
13866 newlen = writer->min_length;
13867
Victor Stinnerd3f08822012-05-29 12:57:52 +020013868 writer->buffer = PyUnicode_New(newlen, maxchar);
13869 if (writer->buffer == NULL)
13870 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013871 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013872 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013873 if (writer->overallocate
13874 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13875 /* overallocate to limit the number of realloc() */
13876 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013877 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013878 if (newlen < writer->min_length)
13879 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013880
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013881 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013882 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013883 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013884 newbuffer = PyUnicode_New(newlen, maxchar);
13885 if (newbuffer == NULL)
13886 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013887 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13888 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013889 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013890 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013891 }
13892 else {
13893 newbuffer = resize_compact(writer->buffer, newlen);
13894 if (newbuffer == NULL)
13895 return -1;
13896 }
13897 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013898 }
13899 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013900 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013901 newbuffer = PyUnicode_New(writer->size, maxchar);
13902 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013903 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013904 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13905 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013906 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013907 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013908 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013909 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013910
13911#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013912}
13913
Victor Stinnerca9381e2015-09-22 00:58:32 +020013914int
13915_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13916 enum PyUnicode_Kind kind)
13917{
13918 Py_UCS4 maxchar;
13919
13920 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13921 assert(writer->kind < kind);
13922
13923 switch (kind)
13924 {
13925 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13926 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13927 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13928 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013929 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013930 }
13931
13932 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13933}
13934
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013935static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013936_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013937{
Victor Stinner2740e462016-09-06 16:58:36 -070013938 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013939 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13940 return -1;
13941 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13942 writer->pos++;
13943 return 0;
13944}
13945
13946int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013947_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13948{
13949 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13950}
13951
13952int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013953_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13954{
13955 Py_UCS4 maxchar;
13956 Py_ssize_t len;
13957
13958 if (PyUnicode_READY(str) == -1)
13959 return -1;
13960 len = PyUnicode_GET_LENGTH(str);
13961 if (len == 0)
13962 return 0;
13963 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13964 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013965 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013966 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013967 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013968 Py_INCREF(str);
13969 writer->buffer = str;
13970 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013971 writer->pos += len;
13972 return 0;
13973 }
13974 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13975 return -1;
13976 }
13977 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13978 str, 0, len);
13979 writer->pos += len;
13980 return 0;
13981}
13982
Victor Stinnere215d962012-10-06 23:03:36 +020013983int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013984_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13985 Py_ssize_t start, Py_ssize_t end)
13986{
13987 Py_UCS4 maxchar;
13988 Py_ssize_t len;
13989
13990 if (PyUnicode_READY(str) == -1)
13991 return -1;
13992
13993 assert(0 <= start);
13994 assert(end <= PyUnicode_GET_LENGTH(str));
13995 assert(start <= end);
13996
13997 if (end == 0)
13998 return 0;
13999
14000 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14001 return _PyUnicodeWriter_WriteStr(writer, str);
14002
14003 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14004 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14005 else
14006 maxchar = writer->maxchar;
14007 len = end - start;
14008
14009 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14010 return -1;
14011
14012 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14013 str, start, len);
14014 writer->pos += len;
14015 return 0;
14016}
14017
14018int
Victor Stinner4a587072013-11-19 12:54:53 +010014019_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14020 const char *ascii, Py_ssize_t len)
14021{
14022 if (len == -1)
14023 len = strlen(ascii);
14024
Andy Lestere6be9b52020-02-11 20:28:35 -060014025 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014026
14027 if (writer->buffer == NULL && !writer->overallocate) {
14028 PyObject *str;
14029
14030 str = _PyUnicode_FromASCII(ascii, len);
14031 if (str == NULL)
14032 return -1;
14033
14034 writer->readonly = 1;
14035 writer->buffer = str;
14036 _PyUnicodeWriter_Update(writer);
14037 writer->pos += len;
14038 return 0;
14039 }
14040
14041 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14042 return -1;
14043
14044 switch (writer->kind)
14045 {
14046 case PyUnicode_1BYTE_KIND:
14047 {
14048 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14049 Py_UCS1 *data = writer->data;
14050
Christian Heimesf051e432016-09-13 20:22:02 +020014051 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014052 break;
14053 }
14054 case PyUnicode_2BYTE_KIND:
14055 {
14056 _PyUnicode_CONVERT_BYTES(
14057 Py_UCS1, Py_UCS2,
14058 ascii, ascii + len,
14059 (Py_UCS2 *)writer->data + writer->pos);
14060 break;
14061 }
14062 case PyUnicode_4BYTE_KIND:
14063 {
14064 _PyUnicode_CONVERT_BYTES(
14065 Py_UCS1, Py_UCS4,
14066 ascii, ascii + len,
14067 (Py_UCS4 *)writer->data + writer->pos);
14068 break;
14069 }
14070 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014071 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014072 }
14073
14074 writer->pos += len;
14075 return 0;
14076}
14077
14078int
14079_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14080 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014081{
14082 Py_UCS4 maxchar;
14083
Andy Lestere6be9b52020-02-11 20:28:35 -060014084 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014085 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14086 return -1;
14087 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14088 writer->pos += len;
14089 return 0;
14090}
14091
Victor Stinnerd3f08822012-05-29 12:57:52 +020014092PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014093_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014094{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014095 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014096
Victor Stinnerd3f08822012-05-29 12:57:52 +020014097 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014098 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014099 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014100 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014101
14102 str = writer->buffer;
14103 writer->buffer = NULL;
14104
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014105 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014106 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14107 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014108 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014109
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014110 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14111 PyObject *str2;
14112 str2 = resize_compact(str, writer->pos);
14113 if (str2 == NULL) {
14114 Py_DECREF(str);
14115 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014116 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014117 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014118 }
14119
Victor Stinner15a0bd32013-07-08 22:29:55 +020014120 assert(_PyUnicode_CheckConsistency(str, 1));
14121 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014122}
14123
Victor Stinnerd3f08822012-05-29 12:57:52 +020014124void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014125_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014126{
14127 Py_CLEAR(writer->buffer);
14128}
14129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014130#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014131
14132PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014133 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014134\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014135Return a formatted version of S, using substitutions from args and kwargs.\n\
14136The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014137
Eric Smith27bbca62010-11-04 17:06:58 +000014138PyDoc_STRVAR(format_map__doc__,
14139 "S.format_map(mapping) -> str\n\
14140\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014141Return a formatted version of S, using substitutions from mapping.\n\
14142The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014143
INADA Naoki3ae20562017-01-16 20:41:20 +090014144/*[clinic input]
14145str.__format__ as unicode___format__
14146
14147 format_spec: unicode
14148 /
14149
14150Return a formatted version of the string as described by format_spec.
14151[clinic start generated code]*/
14152
Eric Smith4a7d76d2008-05-30 18:10:19 +000014153static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014154unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014155/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014156{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014157 _PyUnicodeWriter writer;
14158 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014159
Victor Stinnerd3f08822012-05-29 12:57:52 +020014160 if (PyUnicode_READY(self) == -1)
14161 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014162 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014163 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14164 self, format_spec, 0,
14165 PyUnicode_GET_LENGTH(format_spec));
14166 if (ret == -1) {
14167 _PyUnicodeWriter_Dealloc(&writer);
14168 return NULL;
14169 }
14170 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014171}
14172
INADA Naoki3ae20562017-01-16 20:41:20 +090014173/*[clinic input]
14174str.__sizeof__ as unicode_sizeof
14175
14176Return the size of the string in memory, in bytes.
14177[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014178
14179static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014180unicode_sizeof_impl(PyObject *self)
14181/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014182{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014183 Py_ssize_t size;
14184
14185 /* If it's a compact object, account for base structure +
14186 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014187 if (PyUnicode_IS_COMPACT_ASCII(self))
14188 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14189 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014190 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014191 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014192 else {
14193 /* If it is a two-block object, account for base object, and
14194 for character block if present. */
14195 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014196 if (_PyUnicode_DATA_ANY(self))
14197 size += (PyUnicode_GET_LENGTH(self) + 1) *
14198 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014199 }
14200 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014201 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014202 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14203 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14204 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14205 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014206
14207 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014208}
14209
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014210static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014211unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014212{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014213 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014214 if (!copy)
14215 return NULL;
14216 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014217}
14218
Guido van Rossumd57fd912000-03-10 22:53:23 +000014219static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014220 UNICODE_ENCODE_METHODDEF
14221 UNICODE_REPLACE_METHODDEF
14222 UNICODE_SPLIT_METHODDEF
14223 UNICODE_RSPLIT_METHODDEF
14224 UNICODE_JOIN_METHODDEF
14225 UNICODE_CAPITALIZE_METHODDEF
14226 UNICODE_CASEFOLD_METHODDEF
14227 UNICODE_TITLE_METHODDEF
14228 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014229 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014230 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014231 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014232 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014233 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014234 UNICODE_LJUST_METHODDEF
14235 UNICODE_LOWER_METHODDEF
14236 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014237 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14238 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014239 UNICODE_RJUST_METHODDEF
14240 UNICODE_RSTRIP_METHODDEF
14241 UNICODE_RPARTITION_METHODDEF
14242 UNICODE_SPLITLINES_METHODDEF
14243 UNICODE_STRIP_METHODDEF
14244 UNICODE_SWAPCASE_METHODDEF
14245 UNICODE_TRANSLATE_METHODDEF
14246 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014247 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14248 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014249 UNICODE_REMOVEPREFIX_METHODDEF
14250 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014251 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014252 UNICODE_ISLOWER_METHODDEF
14253 UNICODE_ISUPPER_METHODDEF
14254 UNICODE_ISTITLE_METHODDEF
14255 UNICODE_ISSPACE_METHODDEF
14256 UNICODE_ISDECIMAL_METHODDEF
14257 UNICODE_ISDIGIT_METHODDEF
14258 UNICODE_ISNUMERIC_METHODDEF
14259 UNICODE_ISALPHA_METHODDEF
14260 UNICODE_ISALNUM_METHODDEF
14261 UNICODE_ISIDENTIFIER_METHODDEF
14262 UNICODE_ISPRINTABLE_METHODDEF
14263 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014264 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014265 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014266 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014267 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014268 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014269#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014270 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014271 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014272#endif
14273
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014274 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014275 {NULL, NULL}
14276};
14277
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014278static PyObject *
14279unicode_mod(PyObject *v, PyObject *w)
14280{
Brian Curtindfc80e32011-08-10 20:28:54 -050014281 if (!PyUnicode_Check(v))
14282 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014283 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014284}
14285
14286static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014287 0, /*nb_add*/
14288 0, /*nb_subtract*/
14289 0, /*nb_multiply*/
14290 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014291};
14292
Guido van Rossumd57fd912000-03-10 22:53:23 +000014293static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014294 (lenfunc) unicode_length, /* sq_length */
14295 PyUnicode_Concat, /* sq_concat */
14296 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14297 (ssizeargfunc) unicode_getitem, /* sq_item */
14298 0, /* sq_slice */
14299 0, /* sq_ass_item */
14300 0, /* sq_ass_slice */
14301 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014302};
14303
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014304static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014305unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014307 if (PyUnicode_READY(self) == -1)
14308 return NULL;
14309
Victor Stinnera15e2602020-04-08 02:01:56 +020014310 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014311 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014312 if (i == -1 && PyErr_Occurred())
14313 return NULL;
14314 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014315 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014316 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014317 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014318 Py_ssize_t start, stop, step, slicelength, i;
14319 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014320 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014321 const void *src_data;
14322 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014323 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014324 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014325
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014326 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014327 return NULL;
14328 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014329 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14330 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014331
14332 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014333 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014334 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014335 slicelength == PyUnicode_GET_LENGTH(self)) {
14336 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014337 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014338 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014339 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014340 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014341 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014342 src_kind = PyUnicode_KIND(self);
14343 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014344 if (!PyUnicode_IS_ASCII(self)) {
14345 kind_limit = kind_maxchar_limit(src_kind);
14346 max_char = 0;
14347 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14348 ch = PyUnicode_READ(src_kind, src_data, cur);
14349 if (ch > max_char) {
14350 max_char = ch;
14351 if (max_char >= kind_limit)
14352 break;
14353 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014354 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014355 }
Victor Stinner55c99112011-10-13 01:17:06 +020014356 else
14357 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014358 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014359 if (result == NULL)
14360 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014361 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014362 dest_data = PyUnicode_DATA(result);
14363
14364 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014365 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14366 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014367 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014368 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014369 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014370 } else {
14371 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14372 return NULL;
14373 }
14374}
14375
14376static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 (lenfunc)unicode_length, /* mp_length */
14378 (binaryfunc)unicode_subscript, /* mp_subscript */
14379 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014380};
14381
Guido van Rossumd57fd912000-03-10 22:53:23 +000014382
Guido van Rossumd57fd912000-03-10 22:53:23 +000014383/* Helpers for PyUnicode_Format() */
14384
Victor Stinnera47082312012-10-04 02:19:54 +020014385struct unicode_formatter_t {
14386 PyObject *args;
14387 int args_owned;
14388 Py_ssize_t arglen, argidx;
14389 PyObject *dict;
14390
14391 enum PyUnicode_Kind fmtkind;
14392 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014393 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014394 PyObject *fmtstr;
14395
14396 _PyUnicodeWriter writer;
14397};
14398
14399struct unicode_format_arg_t {
14400 Py_UCS4 ch;
14401 int flags;
14402 Py_ssize_t width;
14403 int prec;
14404 int sign;
14405};
14406
Guido van Rossumd57fd912000-03-10 22:53:23 +000014407static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014408unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014409{
Victor Stinnera47082312012-10-04 02:19:54 +020014410 Py_ssize_t argidx = ctx->argidx;
14411
14412 if (argidx < ctx->arglen) {
14413 ctx->argidx++;
14414 if (ctx->arglen < 0)
14415 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014416 else
Victor Stinnera47082312012-10-04 02:19:54 +020014417 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014418 }
14419 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014420 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014421 return NULL;
14422}
14423
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014424/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014425
Victor Stinnera47082312012-10-04 02:19:54 +020014426/* Format a float into the writer if the writer is not NULL, or into *p_output
14427 otherwise.
14428
14429 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014430static int
Victor Stinnera47082312012-10-04 02:19:54 +020014431formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14432 PyObject **p_output,
14433 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014434{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014435 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014436 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014437 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014438 int prec;
14439 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014440
Guido van Rossumd57fd912000-03-10 22:53:23 +000014441 x = PyFloat_AsDouble(v);
14442 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014443 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014444
Victor Stinnera47082312012-10-04 02:19:54 +020014445 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014446 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014447 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014448
Victor Stinnera47082312012-10-04 02:19:54 +020014449 if (arg->flags & F_ALT)
14450 dtoa_flags = Py_DTSF_ALT;
14451 else
14452 dtoa_flags = 0;
14453 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014454 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014455 return -1;
14456 len = strlen(p);
14457 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014458 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014459 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014460 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014461 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014462 }
14463 else
14464 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014465 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014466 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014467}
14468
Victor Stinnerd0880d52012-04-27 23:40:13 +020014469/* formatlong() emulates the format codes d, u, o, x and X, and
14470 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14471 * Python's regular ints.
14472 * Return value: a new PyUnicodeObject*, or NULL if error.
14473 * The output string is of the form
14474 * "-"? ("0x" | "0X")? digit+
14475 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14476 * set in flags. The case of hex digits will be correct,
14477 * There will be at least prec digits, zero-filled on the left if
14478 * necessary to get that many.
14479 * val object to be converted
14480 * flags bitmask of format flags; only F_ALT is looked at
14481 * prec minimum number of digits; 0-fill on left if needed
14482 * type a character in [duoxX]; u acts the same as d
14483 *
14484 * CAUTION: o, x and X conversions on regular ints can never
14485 * produce a '-' sign, but can for Python's unbounded ints.
14486 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014487PyObject *
14488_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014489{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014490 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014491 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014492 Py_ssize_t i;
14493 int sign; /* 1 if '-', else 0 */
14494 int len; /* number of characters */
14495 Py_ssize_t llen;
14496 int numdigits; /* len == numnondigits + numdigits */
14497 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014498
Victor Stinnerd0880d52012-04-27 23:40:13 +020014499 /* Avoid exceeding SSIZE_T_MAX */
14500 if (prec > INT_MAX-3) {
14501 PyErr_SetString(PyExc_OverflowError,
14502 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014503 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014504 }
14505
14506 assert(PyLong_Check(val));
14507
14508 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014509 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014510 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014511 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014512 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014513 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014514 /* int and int subclasses should print numerically when a numeric */
14515 /* format code is used (see issue18780) */
14516 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014517 break;
14518 case 'o':
14519 numnondigits = 2;
14520 result = PyNumber_ToBase(val, 8);
14521 break;
14522 case 'x':
14523 case 'X':
14524 numnondigits = 2;
14525 result = PyNumber_ToBase(val, 16);
14526 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014527 }
14528 if (!result)
14529 return NULL;
14530
14531 assert(unicode_modifiable(result));
14532 assert(PyUnicode_IS_READY(result));
14533 assert(PyUnicode_IS_ASCII(result));
14534
14535 /* To modify the string in-place, there can only be one reference. */
14536 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014537 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014538 PyErr_BadInternalCall();
14539 return NULL;
14540 }
14541 buf = PyUnicode_DATA(result);
14542 llen = PyUnicode_GET_LENGTH(result);
14543 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014544 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014545 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014546 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014547 return NULL;
14548 }
14549 len = (int)llen;
14550 sign = buf[0] == '-';
14551 numnondigits += sign;
14552 numdigits = len - numnondigits;
14553 assert(numdigits > 0);
14554
14555 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014556 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014557 (type == 'o' || type == 'x' || type == 'X'))) {
14558 assert(buf[sign] == '0');
14559 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14560 buf[sign+1] == 'o');
14561 numnondigits -= 2;
14562 buf += 2;
14563 len -= 2;
14564 if (sign)
14565 buf[0] = '-';
14566 assert(len == numnondigits + numdigits);
14567 assert(numdigits > 0);
14568 }
14569
14570 /* Fill with leading zeroes to meet minimum width. */
14571 if (prec > numdigits) {
14572 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14573 numnondigits + prec);
14574 char *b1;
14575 if (!r1) {
14576 Py_DECREF(result);
14577 return NULL;
14578 }
14579 b1 = PyBytes_AS_STRING(r1);
14580 for (i = 0; i < numnondigits; ++i)
14581 *b1++ = *buf++;
14582 for (i = 0; i < prec - numdigits; i++)
14583 *b1++ = '0';
14584 for (i = 0; i < numdigits; i++)
14585 *b1++ = *buf++;
14586 *b1 = '\0';
14587 Py_DECREF(result);
14588 result = r1;
14589 buf = PyBytes_AS_STRING(result);
14590 len = numnondigits + prec;
14591 }
14592
14593 /* Fix up case for hex conversions. */
14594 if (type == 'X') {
14595 /* Need to convert all lower case letters to upper case.
14596 and need to convert 0x to 0X (and -0x to -0X). */
14597 for (i = 0; i < len; i++)
14598 if (buf[i] >= 'a' && buf[i] <= 'x')
14599 buf[i] -= 'a'-'A';
14600 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014601 if (!PyUnicode_Check(result)
14602 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014603 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014604 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014605 Py_DECREF(result);
14606 result = unicode;
14607 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014608 else if (len != PyUnicode_GET_LENGTH(result)) {
14609 if (PyUnicode_Resize(&result, len) < 0)
14610 Py_CLEAR(result);
14611 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014612 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014613}
14614
Ethan Furmandf3ed242014-01-05 06:50:30 -080014615/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014616 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014617 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014618 * -1 and raise an exception on error */
14619static int
Victor Stinnera47082312012-10-04 02:19:54 +020014620mainformatlong(PyObject *v,
14621 struct unicode_format_arg_t *arg,
14622 PyObject **p_output,
14623 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014624{
14625 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014626 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014627
14628 if (!PyNumber_Check(v))
14629 goto wrongtype;
14630
Ethan Furman9ab74802014-03-21 06:38:46 -070014631 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014632 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014633 if (type == 'o' || type == 'x' || type == 'X') {
14634 iobj = PyNumber_Index(v);
14635 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014636 if (PyErr_ExceptionMatches(PyExc_TypeError))
14637 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014638 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014639 }
14640 }
14641 else {
14642 iobj = PyNumber_Long(v);
14643 if (iobj == NULL ) {
14644 if (PyErr_ExceptionMatches(PyExc_TypeError))
14645 goto wrongtype;
14646 return -1;
14647 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014648 }
14649 assert(PyLong_Check(iobj));
14650 }
14651 else {
14652 iobj = v;
14653 Py_INCREF(iobj);
14654 }
14655
14656 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014657 && arg->width == -1 && arg->prec == -1
14658 && !(arg->flags & (F_SIGN | F_BLANK))
14659 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014660 {
14661 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014662 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014663 int base;
14664
Victor Stinnera47082312012-10-04 02:19:54 +020014665 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014666 {
14667 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014668 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014669 case 'd':
14670 case 'i':
14671 case 'u':
14672 base = 10;
14673 break;
14674 case 'o':
14675 base = 8;
14676 break;
14677 case 'x':
14678 case 'X':
14679 base = 16;
14680 break;
14681 }
14682
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014683 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14684 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014685 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014686 }
14687 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014688 return 1;
14689 }
14690
Ethan Furmanb95b5612015-01-23 20:05:18 -080014691 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014692 Py_DECREF(iobj);
14693 if (res == NULL)
14694 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014695 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014696 return 0;
14697
14698wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014699 switch(type)
14700 {
14701 case 'o':
14702 case 'x':
14703 case 'X':
14704 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014705 "%%%c format: an integer is required, "
14706 "not %.200s",
14707 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014708 break;
14709 default:
14710 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014711 "%%%c format: a number is required, "
14712 "not %.200s",
14713 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014714 break;
14715 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014716 return -1;
14717}
14718
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014719static Py_UCS4
14720formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014721{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014722 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014723 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014724 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014725 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014726 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014727 goto onError;
14728 }
14729 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014730 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014731 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014732 /* make sure number is a type of integer */
14733 if (!PyLong_Check(v)) {
14734 iobj = PyNumber_Index(v);
14735 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014736 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014737 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014738 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014739 Py_DECREF(iobj);
14740 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014741 else {
14742 x = PyLong_AsLong(v);
14743 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014744 if (x == -1 && PyErr_Occurred())
14745 goto onError;
14746
Victor Stinner8faf8212011-12-08 22:14:11 +010014747 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014748 PyErr_SetString(PyExc_OverflowError,
14749 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014750 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014751 }
14752
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014753 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014754 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014755
Benjamin Peterson29060642009-01-31 22:14:21 +000014756 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014757 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014758 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014759 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014760}
14761
Victor Stinnera47082312012-10-04 02:19:54 +020014762/* Parse options of an argument: flags, width, precision.
14763 Handle also "%(name)" syntax.
14764
14765 Return 0 if the argument has been formatted into arg->str.
14766 Return 1 if the argument has been written into ctx->writer,
14767 Raise an exception and return -1 on error. */
14768static int
14769unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14770 struct unicode_format_arg_t *arg)
14771{
14772#define FORMAT_READ(ctx) \
14773 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14774
14775 PyObject *v;
14776
Victor Stinnera47082312012-10-04 02:19:54 +020014777 if (arg->ch == '(') {
14778 /* Get argument value from a dictionary. Example: "%(name)s". */
14779 Py_ssize_t keystart;
14780 Py_ssize_t keylen;
14781 PyObject *key;
14782 int pcount = 1;
14783
14784 if (ctx->dict == NULL) {
14785 PyErr_SetString(PyExc_TypeError,
14786 "format requires a mapping");
14787 return -1;
14788 }
14789 ++ctx->fmtpos;
14790 --ctx->fmtcnt;
14791 keystart = ctx->fmtpos;
14792 /* Skip over balanced parentheses */
14793 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14794 arg->ch = FORMAT_READ(ctx);
14795 if (arg->ch == ')')
14796 --pcount;
14797 else if (arg->ch == '(')
14798 ++pcount;
14799 ctx->fmtpos++;
14800 }
14801 keylen = ctx->fmtpos - keystart - 1;
14802 if (ctx->fmtcnt < 0 || pcount > 0) {
14803 PyErr_SetString(PyExc_ValueError,
14804 "incomplete format key");
14805 return -1;
14806 }
14807 key = PyUnicode_Substring(ctx->fmtstr,
14808 keystart, keystart + keylen);
14809 if (key == NULL)
14810 return -1;
14811 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014812 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014813 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014814 }
14815 ctx->args = PyObject_GetItem(ctx->dict, key);
14816 Py_DECREF(key);
14817 if (ctx->args == NULL)
14818 return -1;
14819 ctx->args_owned = 1;
14820 ctx->arglen = -1;
14821 ctx->argidx = -2;
14822 }
14823
14824 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014825 while (--ctx->fmtcnt >= 0) {
14826 arg->ch = FORMAT_READ(ctx);
14827 ctx->fmtpos++;
14828 switch (arg->ch) {
14829 case '-': arg->flags |= F_LJUST; continue;
14830 case '+': arg->flags |= F_SIGN; continue;
14831 case ' ': arg->flags |= F_BLANK; continue;
14832 case '#': arg->flags |= F_ALT; continue;
14833 case '0': arg->flags |= F_ZERO; continue;
14834 }
14835 break;
14836 }
14837
14838 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014839 if (arg->ch == '*') {
14840 v = unicode_format_getnextarg(ctx);
14841 if (v == NULL)
14842 return -1;
14843 if (!PyLong_Check(v)) {
14844 PyErr_SetString(PyExc_TypeError,
14845 "* wants int");
14846 return -1;
14847 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014848 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014849 if (arg->width == -1 && PyErr_Occurred())
14850 return -1;
14851 if (arg->width < 0) {
14852 arg->flags |= F_LJUST;
14853 arg->width = -arg->width;
14854 }
14855 if (--ctx->fmtcnt >= 0) {
14856 arg->ch = FORMAT_READ(ctx);
14857 ctx->fmtpos++;
14858 }
14859 }
14860 else if (arg->ch >= '0' && arg->ch <= '9') {
14861 arg->width = arg->ch - '0';
14862 while (--ctx->fmtcnt >= 0) {
14863 arg->ch = FORMAT_READ(ctx);
14864 ctx->fmtpos++;
14865 if (arg->ch < '0' || arg->ch > '9')
14866 break;
14867 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14868 mixing signed and unsigned comparison. Since arg->ch is between
14869 '0' and '9', casting to int is safe. */
14870 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14871 PyErr_SetString(PyExc_ValueError,
14872 "width too big");
14873 return -1;
14874 }
14875 arg->width = arg->width*10 + (arg->ch - '0');
14876 }
14877 }
14878
14879 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014880 if (arg->ch == '.') {
14881 arg->prec = 0;
14882 if (--ctx->fmtcnt >= 0) {
14883 arg->ch = FORMAT_READ(ctx);
14884 ctx->fmtpos++;
14885 }
14886 if (arg->ch == '*') {
14887 v = unicode_format_getnextarg(ctx);
14888 if (v == NULL)
14889 return -1;
14890 if (!PyLong_Check(v)) {
14891 PyErr_SetString(PyExc_TypeError,
14892 "* wants int");
14893 return -1;
14894 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014895 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014896 if (arg->prec == -1 && PyErr_Occurred())
14897 return -1;
14898 if (arg->prec < 0)
14899 arg->prec = 0;
14900 if (--ctx->fmtcnt >= 0) {
14901 arg->ch = FORMAT_READ(ctx);
14902 ctx->fmtpos++;
14903 }
14904 }
14905 else if (arg->ch >= '0' && arg->ch <= '9') {
14906 arg->prec = arg->ch - '0';
14907 while (--ctx->fmtcnt >= 0) {
14908 arg->ch = FORMAT_READ(ctx);
14909 ctx->fmtpos++;
14910 if (arg->ch < '0' || arg->ch > '9')
14911 break;
14912 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14913 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014914 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014915 return -1;
14916 }
14917 arg->prec = arg->prec*10 + (arg->ch - '0');
14918 }
14919 }
14920 }
14921
14922 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14923 if (ctx->fmtcnt >= 0) {
14924 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14925 if (--ctx->fmtcnt >= 0) {
14926 arg->ch = FORMAT_READ(ctx);
14927 ctx->fmtpos++;
14928 }
14929 }
14930 }
14931 if (ctx->fmtcnt < 0) {
14932 PyErr_SetString(PyExc_ValueError,
14933 "incomplete format");
14934 return -1;
14935 }
14936 return 0;
14937
14938#undef FORMAT_READ
14939}
14940
14941/* Format one argument. Supported conversion specifiers:
14942
14943 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014944 - "i", "d", "u": int or float
14945 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014946 - "e", "E", "f", "F", "g", "G": float
14947 - "c": int or str (1 character)
14948
Victor Stinner8dbd4212012-12-04 09:30:24 +010014949 When possible, the output is written directly into the Unicode writer
14950 (ctx->writer). A string is created when padding is required.
14951
Victor Stinnera47082312012-10-04 02:19:54 +020014952 Return 0 if the argument has been formatted into *p_str,
14953 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014954 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014955static int
14956unicode_format_arg_format(struct unicode_formatter_t *ctx,
14957 struct unicode_format_arg_t *arg,
14958 PyObject **p_str)
14959{
14960 PyObject *v;
14961 _PyUnicodeWriter *writer = &ctx->writer;
14962
14963 if (ctx->fmtcnt == 0)
14964 ctx->writer.overallocate = 0;
14965
Victor Stinnera47082312012-10-04 02:19:54 +020014966 v = unicode_format_getnextarg(ctx);
14967 if (v == NULL)
14968 return -1;
14969
Victor Stinnera47082312012-10-04 02:19:54 +020014970
14971 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014972 case 's':
14973 case 'r':
14974 case 'a':
14975 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14976 /* Fast path */
14977 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14978 return -1;
14979 return 1;
14980 }
14981
14982 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14983 *p_str = v;
14984 Py_INCREF(*p_str);
14985 }
14986 else {
14987 if (arg->ch == 's')
14988 *p_str = PyObject_Str(v);
14989 else if (arg->ch == 'r')
14990 *p_str = PyObject_Repr(v);
14991 else
14992 *p_str = PyObject_ASCII(v);
14993 }
14994 break;
14995
14996 case 'i':
14997 case 'd':
14998 case 'u':
14999 case 'o':
15000 case 'x':
15001 case 'X':
15002 {
15003 int ret = mainformatlong(v, arg, p_str, writer);
15004 if (ret != 0)
15005 return ret;
15006 arg->sign = 1;
15007 break;
15008 }
15009
15010 case 'e':
15011 case 'E':
15012 case 'f':
15013 case 'F':
15014 case 'g':
15015 case 'G':
15016 if (arg->width == -1 && arg->prec == -1
15017 && !(arg->flags & (F_SIGN | F_BLANK)))
15018 {
15019 /* Fast path */
15020 if (formatfloat(v, arg, NULL, writer) == -1)
15021 return -1;
15022 return 1;
15023 }
15024
15025 arg->sign = 1;
15026 if (formatfloat(v, arg, p_str, NULL) == -1)
15027 return -1;
15028 break;
15029
15030 case 'c':
15031 {
15032 Py_UCS4 ch = formatchar(v);
15033 if (ch == (Py_UCS4) -1)
15034 return -1;
15035 if (arg->width == -1 && arg->prec == -1) {
15036 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015037 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015038 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015039 return 1;
15040 }
15041 *p_str = PyUnicode_FromOrdinal(ch);
15042 break;
15043 }
15044
15045 default:
15046 PyErr_Format(PyExc_ValueError,
15047 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015048 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015049 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15050 (int)arg->ch,
15051 ctx->fmtpos - 1);
15052 return -1;
15053 }
15054 if (*p_str == NULL)
15055 return -1;
15056 assert (PyUnicode_Check(*p_str));
15057 return 0;
15058}
15059
15060static int
15061unicode_format_arg_output(struct unicode_formatter_t *ctx,
15062 struct unicode_format_arg_t *arg,
15063 PyObject *str)
15064{
15065 Py_ssize_t len;
15066 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015067 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015068 Py_ssize_t pindex;
15069 Py_UCS4 signchar;
15070 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015071 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015072 Py_ssize_t sublen;
15073 _PyUnicodeWriter *writer = &ctx->writer;
15074 Py_UCS4 fill;
15075
15076 fill = ' ';
15077 if (arg->sign && arg->flags & F_ZERO)
15078 fill = '0';
15079
15080 if (PyUnicode_READY(str) == -1)
15081 return -1;
15082
15083 len = PyUnicode_GET_LENGTH(str);
15084 if ((arg->width == -1 || arg->width <= len)
15085 && (arg->prec == -1 || arg->prec >= len)
15086 && !(arg->flags & (F_SIGN | F_BLANK)))
15087 {
15088 /* Fast path */
15089 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15090 return -1;
15091 return 0;
15092 }
15093
15094 /* Truncate the string for "s", "r" and "a" formats
15095 if the precision is set */
15096 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15097 if (arg->prec >= 0 && len > arg->prec)
15098 len = arg->prec;
15099 }
15100
15101 /* Adjust sign and width */
15102 kind = PyUnicode_KIND(str);
15103 pbuf = PyUnicode_DATA(str);
15104 pindex = 0;
15105 signchar = '\0';
15106 if (arg->sign) {
15107 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15108 if (ch == '-' || ch == '+') {
15109 signchar = ch;
15110 len--;
15111 pindex++;
15112 }
15113 else if (arg->flags & F_SIGN)
15114 signchar = '+';
15115 else if (arg->flags & F_BLANK)
15116 signchar = ' ';
15117 else
15118 arg->sign = 0;
15119 }
15120 if (arg->width < len)
15121 arg->width = len;
15122
15123 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015124 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015125 if (!(arg->flags & F_LJUST)) {
15126 if (arg->sign) {
15127 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015128 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015129 }
15130 else {
15131 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015132 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015133 }
15134 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015135 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15136 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015137 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015138 }
15139
Victor Stinnera47082312012-10-04 02:19:54 +020015140 buflen = arg->width;
15141 if (arg->sign && len == arg->width)
15142 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015143 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015144 return -1;
15145
15146 /* Write the sign if needed */
15147 if (arg->sign) {
15148 if (fill != ' ') {
15149 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15150 writer->pos += 1;
15151 }
15152 if (arg->width > len)
15153 arg->width--;
15154 }
15155
15156 /* Write the numeric prefix for "x", "X" and "o" formats
15157 if the alternate form is used.
15158 For example, write "0x" for the "%#x" format. */
15159 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15160 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15161 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15162 if (fill != ' ') {
15163 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15164 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15165 writer->pos += 2;
15166 pindex += 2;
15167 }
15168 arg->width -= 2;
15169 if (arg->width < 0)
15170 arg->width = 0;
15171 len -= 2;
15172 }
15173
15174 /* Pad left with the fill character if needed */
15175 if (arg->width > len && !(arg->flags & F_LJUST)) {
15176 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015177 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015178 writer->pos += sublen;
15179 arg->width = len;
15180 }
15181
15182 /* If padding with spaces: write sign if needed and/or numeric prefix if
15183 the alternate form is used */
15184 if (fill == ' ') {
15185 if (arg->sign) {
15186 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15187 writer->pos += 1;
15188 }
15189 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15190 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15191 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15192 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15193 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15194 writer->pos += 2;
15195 pindex += 2;
15196 }
15197 }
15198
15199 /* Write characters */
15200 if (len) {
15201 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15202 str, pindex, len);
15203 writer->pos += len;
15204 }
15205
15206 /* Pad right with the fill character if needed */
15207 if (arg->width > len) {
15208 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015209 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015210 writer->pos += sublen;
15211 }
15212 return 0;
15213}
15214
15215/* Helper of PyUnicode_Format(): format one arg.
15216 Return 0 on success, raise an exception and return -1 on error. */
15217static int
15218unicode_format_arg(struct unicode_formatter_t *ctx)
15219{
15220 struct unicode_format_arg_t arg;
15221 PyObject *str;
15222 int ret;
15223
Victor Stinner8dbd4212012-12-04 09:30:24 +010015224 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015225 if (arg.ch == '%') {
15226 ctx->fmtpos++;
15227 ctx->fmtcnt--;
15228 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15229 return -1;
15230 return 0;
15231 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015232 arg.flags = 0;
15233 arg.width = -1;
15234 arg.prec = -1;
15235 arg.sign = 0;
15236 str = NULL;
15237
Victor Stinnera47082312012-10-04 02:19:54 +020015238 ret = unicode_format_arg_parse(ctx, &arg);
15239 if (ret == -1)
15240 return -1;
15241
15242 ret = unicode_format_arg_format(ctx, &arg, &str);
15243 if (ret == -1)
15244 return -1;
15245
15246 if (ret != 1) {
15247 ret = unicode_format_arg_output(ctx, &arg, str);
15248 Py_DECREF(str);
15249 if (ret == -1)
15250 return -1;
15251 }
15252
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015253 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015254 PyErr_SetString(PyExc_TypeError,
15255 "not all arguments converted during string formatting");
15256 return -1;
15257 }
15258 return 0;
15259}
15260
Alexander Belopolsky40018472011-02-26 01:02:56 +000015261PyObject *
15262PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015263{
Victor Stinnera47082312012-10-04 02:19:54 +020015264 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015265
Guido van Rossumd57fd912000-03-10 22:53:23 +000015266 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015267 PyErr_BadInternalCall();
15268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015269 }
Victor Stinnera47082312012-10-04 02:19:54 +020015270
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015271 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015272 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015273
15274 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015275 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15276 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15277 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15278 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015279
Victor Stinner8f674cc2013-04-17 23:02:17 +020015280 _PyUnicodeWriter_Init(&ctx.writer);
15281 ctx.writer.min_length = ctx.fmtcnt + 100;
15282 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015283
Guido van Rossumd57fd912000-03-10 22:53:23 +000015284 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015285 ctx.arglen = PyTuple_Size(args);
15286 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015287 }
15288 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015289 ctx.arglen = -1;
15290 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015291 }
Victor Stinnera47082312012-10-04 02:19:54 +020015292 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015293 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015294 ctx.dict = args;
15295 else
15296 ctx.dict = NULL;
15297 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015298
Victor Stinnera47082312012-10-04 02:19:54 +020015299 while (--ctx.fmtcnt >= 0) {
15300 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015301 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015302
15303 nonfmtpos = ctx.fmtpos++;
15304 while (ctx.fmtcnt >= 0 &&
15305 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15306 ctx.fmtpos++;
15307 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015308 }
Victor Stinnera47082312012-10-04 02:19:54 +020015309 if (ctx.fmtcnt < 0) {
15310 ctx.fmtpos--;
15311 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015312 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015313
Victor Stinnercfc4c132013-04-03 01:48:39 +020015314 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15315 nonfmtpos, ctx.fmtpos) < 0)
15316 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015317 }
15318 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015319 ctx.fmtpos++;
15320 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015321 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015322 }
15323 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015324
Victor Stinnera47082312012-10-04 02:19:54 +020015325 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015326 PyErr_SetString(PyExc_TypeError,
15327 "not all arguments converted during string formatting");
15328 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015329 }
15330
Victor Stinnera47082312012-10-04 02:19:54 +020015331 if (ctx.args_owned) {
15332 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015333 }
Victor Stinnera47082312012-10-04 02:19:54 +020015334 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015335
Benjamin Peterson29060642009-01-31 22:14:21 +000015336 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015337 _PyUnicodeWriter_Dealloc(&ctx.writer);
15338 if (ctx.args_owned) {
15339 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015340 }
15341 return NULL;
15342}
15343
Jeremy Hylton938ace62002-07-17 16:30:39 +000015344static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015345unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15346
Tim Peters6d6c1a32001-08-02 04:15:00 +000015347static PyObject *
15348unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15349{
Benjamin Peterson29060642009-01-31 22:14:21 +000015350 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015351 static char *kwlist[] = {"object", "encoding", "errors", 0};
15352 char *encoding = NULL;
15353 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015354
Benjamin Peterson14339b62009-01-31 16:36:08 +000015355 if (type != &PyUnicode_Type)
15356 return unicode_subtype_new(type, args, kwds);
15357 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015358 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015359 return NULL;
15360 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015361 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015362 if (encoding == NULL && errors == NULL)
15363 return PyObject_Str(x);
15364 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015365 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015366}
15367
Guido van Rossume023fe02001-08-30 03:12:59 +000015368static PyObject *
15369unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15370{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015371 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015372 Py_ssize_t length, char_size;
15373 int share_wstr, share_utf8;
15374 unsigned int kind;
15375 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015376
Benjamin Peterson14339b62009-01-31 16:36:08 +000015377 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015378
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015379 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015380 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015381 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015382 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015383 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015384 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015385 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015386 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015387
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015388 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015389 if (self == NULL) {
15390 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015391 return NULL;
15392 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015393 kind = PyUnicode_KIND(unicode);
15394 length = PyUnicode_GET_LENGTH(unicode);
15395
15396 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015397#ifdef Py_DEBUG
15398 _PyUnicode_HASH(self) = -1;
15399#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015400 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015401#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015402 _PyUnicode_STATE(self).interned = 0;
15403 _PyUnicode_STATE(self).kind = kind;
15404 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015405 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015406 _PyUnicode_STATE(self).ready = 1;
15407 _PyUnicode_WSTR(self) = NULL;
15408 _PyUnicode_UTF8_LENGTH(self) = 0;
15409 _PyUnicode_UTF8(self) = NULL;
15410 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015411 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015412
15413 share_utf8 = 0;
15414 share_wstr = 0;
15415 if (kind == PyUnicode_1BYTE_KIND) {
15416 char_size = 1;
15417 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15418 share_utf8 = 1;
15419 }
15420 else if (kind == PyUnicode_2BYTE_KIND) {
15421 char_size = 2;
15422 if (sizeof(wchar_t) == 2)
15423 share_wstr = 1;
15424 }
15425 else {
15426 assert(kind == PyUnicode_4BYTE_KIND);
15427 char_size = 4;
15428 if (sizeof(wchar_t) == 4)
15429 share_wstr = 1;
15430 }
15431
15432 /* Ensure we won't overflow the length. */
15433 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15434 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015435 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015436 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015437 data = PyObject_MALLOC((length + 1) * char_size);
15438 if (data == NULL) {
15439 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015440 goto onError;
15441 }
15442
Victor Stinnerc3c74152011-10-02 20:39:55 +020015443 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015444 if (share_utf8) {
15445 _PyUnicode_UTF8_LENGTH(self) = length;
15446 _PyUnicode_UTF8(self) = data;
15447 }
15448 if (share_wstr) {
15449 _PyUnicode_WSTR_LENGTH(self) = length;
15450 _PyUnicode_WSTR(self) = (wchar_t *)data;
15451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015452
Christian Heimesf051e432016-09-13 20:22:02 +020015453 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015454 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015455 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015456#ifdef Py_DEBUG
15457 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15458#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015459 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015460 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015461
15462onError:
15463 Py_DECREF(unicode);
15464 Py_DECREF(self);
15465 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015466}
15467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015468PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015469"str(object='') -> str\n\
15470str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015471\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015472Create a new string object from the given object. If encoding or\n\
15473errors is specified, then the object must expose a data buffer\n\
15474that will be decoded using the given encoding and error handler.\n\
15475Otherwise, returns the result of object.__str__() (if defined)\n\
15476or repr(object).\n\
15477encoding defaults to sys.getdefaultencoding().\n\
15478errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015479
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015480static PyObject *unicode_iter(PyObject *seq);
15481
Guido van Rossumd57fd912000-03-10 22:53:23 +000015482PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015483 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015484 "str", /* tp_name */
15485 sizeof(PyUnicodeObject), /* tp_basicsize */
15486 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015487 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015488 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015489 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015490 0, /* tp_getattr */
15491 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015492 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015493 unicode_repr, /* tp_repr */
15494 &unicode_as_number, /* tp_as_number */
15495 &unicode_as_sequence, /* tp_as_sequence */
15496 &unicode_as_mapping, /* tp_as_mapping */
15497 (hashfunc) unicode_hash, /* tp_hash*/
15498 0, /* tp_call*/
15499 (reprfunc) unicode_str, /* tp_str */
15500 PyObject_GenericGetAttr, /* tp_getattro */
15501 0, /* tp_setattro */
15502 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015503 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015504 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15505 unicode_doc, /* tp_doc */
15506 0, /* tp_traverse */
15507 0, /* tp_clear */
15508 PyUnicode_RichCompare, /* tp_richcompare */
15509 0, /* tp_weaklistoffset */
15510 unicode_iter, /* tp_iter */
15511 0, /* tp_iternext */
15512 unicode_methods, /* tp_methods */
15513 0, /* tp_members */
15514 0, /* tp_getset */
15515 &PyBaseObject_Type, /* tp_base */
15516 0, /* tp_dict */
15517 0, /* tp_descr_get */
15518 0, /* tp_descr_set */
15519 0, /* tp_dictoffset */
15520 0, /* tp_init */
15521 0, /* tp_alloc */
15522 unicode_new, /* tp_new */
15523 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015524};
15525
15526/* Initialize the Unicode implementation */
15527
Victor Stinner331a6a52019-05-27 16:39:22 +020015528PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015529_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015530{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015531 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015532 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015533 0x000A, /* LINE FEED */
15534 0x000D, /* CARRIAGE RETURN */
15535 0x001C, /* FILE SEPARATOR */
15536 0x001D, /* GROUP SEPARATOR */
15537 0x001E, /* RECORD SEPARATOR */
15538 0x0085, /* NEXT LINE */
15539 0x2028, /* LINE SEPARATOR */
15540 0x2029, /* PARAGRAPH SEPARATOR */
15541 };
15542
Fred Drakee4315f52000-05-09 19:53:39 +000015543 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015544 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015545 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015546 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015547 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015548 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015549
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015550 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015551 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015552 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015553
15554 /* initialize the linebreak bloom filter */
15555 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015556 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015557 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015558
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015559 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015560 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015561 }
15562 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015563 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015564 }
15565 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015566 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015567 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015568 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015569}
15570
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015571
Walter Dörwald16807132007-05-25 13:52:07 +000015572void
15573PyUnicode_InternInPlace(PyObject **p)
15574{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015575 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015576#ifdef Py_DEBUG
15577 assert(s != NULL);
15578 assert(_PyUnicode_CHECK(s));
15579#else
Victor Stinner607b1022020-05-05 18:50:30 +020015580 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015581 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015582 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015583#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015584
Benjamin Peterson14339b62009-01-31 16:36:08 +000015585 /* If it's a subclass, we don't really know what putting
15586 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015587 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015588 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015589 }
15590
15591 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015592 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015593 }
15594
15595#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015596 if (interned == NULL) {
15597 interned = PyDict_New();
15598 if (interned == NULL) {
15599 PyErr_Clear(); /* Don't leave an exception */
15600 return;
15601 }
15602 }
Victor Stinner607b1022020-05-05 18:50:30 +020015603
15604 PyObject *t;
Berker Peksagced8d4c2016-07-25 04:40:39 +030015605 t = PyDict_SetDefault(interned, s, s);
Victor Stinner607b1022020-05-05 18:50:30 +020015606
Berker Peksagced8d4c2016-07-25 04:40:39 +030015607 if (t == NULL) {
15608 PyErr_Clear();
15609 return;
15610 }
Victor Stinner607b1022020-05-05 18:50:30 +020015611
Berker Peksagced8d4c2016-07-25 04:40:39 +030015612 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015613 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015614 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015615 return;
15616 }
Victor Stinner607b1022020-05-05 18:50:30 +020015617
Benjamin Peterson14339b62009-01-31 16:36:08 +000015618 /* The two references in interned are not counted by refcnt.
15619 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015620 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015621 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015622#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015623}
15624
15625void
15626PyUnicode_InternImmortal(PyObject **p)
15627{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015628 PyUnicode_InternInPlace(p);
15629 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015630 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015631 Py_INCREF(*p);
15632 }
Walter Dörwald16807132007-05-25 13:52:07 +000015633}
15634
15635PyObject *
15636PyUnicode_InternFromString(const char *cp)
15637{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015638 PyObject *s = PyUnicode_FromString(cp);
15639 if (s == NULL)
15640 return NULL;
15641 PyUnicode_InternInPlace(&s);
15642 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015643}
15644
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015645
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Release the interned dict so that memory checkers do not report every
   interned string as still in use at exit.

   Since this function is intended to help a leak detector, interned unicode
   strings are not forcibly deallocated; rather, we give them their stolen
   references back, and then clear and DECREF the interned dict. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        /* Do not leak the object in the (defensive) non-list case. */
        Py_XDECREF(keys);
        PyErr_Clear();
        return;
    }

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        /* Use Py_SET_REFCNT() rather than assigning through Py_REFCNT(),
           like PyUnicode_InternInPlace() does when it takes the references
           away. */
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* Mortal interning removed two references and making the
               string immortal added one back: one is still owed. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Give back the two uncounted dict references. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            /* Every key of the interned dict must be flagged as interned. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015706
15707
15708/********************* Unicode Iterator **************************/
15709
15710typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015711 PyObject_HEAD
15712 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015713 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015714} unicodeiterobject;
15715
15716static void
15717unicodeiter_dealloc(unicodeiterobject *it)
15718{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015719 _PyObject_GC_UNTRACK(it);
15720 Py_XDECREF(it->it_seq);
15721 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015722}
15723
15724static int
15725unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15726{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015727 Py_VISIT(it->it_seq);
15728 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015729}
15730
15731static PyObject *
15732unicodeiter_next(unicodeiterobject *it)
15733{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015734 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015735
Benjamin Peterson14339b62009-01-31 16:36:08 +000015736 assert(it != NULL);
15737 seq = it->it_seq;
15738 if (seq == NULL)
15739 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015740 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015742 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15743 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015744 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015745 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15746 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015747 if (item != NULL)
15748 ++it->it_index;
15749 return item;
15750 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015751
Benjamin Peterson14339b62009-01-31 16:36:08 +000015752 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015753 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015754 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015755}
15756
15757static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015758unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015759{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015760 Py_ssize_t len = 0;
15761 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015762 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015763 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015764}
15765
15766PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15767
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015768static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015769unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015770{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015771 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015772 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015773 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015774 it->it_seq, it->it_index);
15775 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015776 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015777 if (u == NULL)
15778 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015779 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015780 }
15781}
15782
15783PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15784
15785static PyObject *
15786unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15787{
15788 Py_ssize_t index = PyLong_AsSsize_t(state);
15789 if (index == -1 && PyErr_Occurred())
15790 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015791 if (it->it_seq != NULL) {
15792 if (index < 0)
15793 index = 0;
15794 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15795 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15796 it->it_index = index;
15797 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015798 Py_RETURN_NONE;
15799}
15800
15801PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15802
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015803static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015804 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015805 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015806 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15807 reduce_doc},
15808 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15809 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015810 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015811};
15812
15813PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015814 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15815 "str_iterator", /* tp_name */
15816 sizeof(unicodeiterobject), /* tp_basicsize */
15817 0, /* tp_itemsize */
15818 /* methods */
15819 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015820 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015821 0, /* tp_getattr */
15822 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015823 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015824 0, /* tp_repr */
15825 0, /* tp_as_number */
15826 0, /* tp_as_sequence */
15827 0, /* tp_as_mapping */
15828 0, /* tp_hash */
15829 0, /* tp_call */
15830 0, /* tp_str */
15831 PyObject_GenericGetAttr, /* tp_getattro */
15832 0, /* tp_setattro */
15833 0, /* tp_as_buffer */
15834 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15835 0, /* tp_doc */
15836 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15837 0, /* tp_clear */
15838 0, /* tp_richcompare */
15839 0, /* tp_weaklistoffset */
15840 PyObject_SelfIter, /* tp_iter */
15841 (iternextfunc)unicodeiter_next, /* tp_iternext */
15842 unicodeiter_methods, /* tp_methods */
15843 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015844};
15845
15846static PyObject *
15847unicode_iter(PyObject *seq)
15848{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015849 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015850
Benjamin Peterson14339b62009-01-31 16:36:08 +000015851 if (!PyUnicode_Check(seq)) {
15852 PyErr_BadInternalCall();
15853 return NULL;
15854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015855 if (PyUnicode_READY(seq) == -1)
15856 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015857 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15858 if (it == NULL)
15859 return NULL;
15860 it->it_index = 0;
15861 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015862 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015863 _PyObject_GC_TRACK(it);
15864 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015865}
15866
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015867
15868size_t
15869Py_UNICODE_strlen(const Py_UNICODE *u)
15870{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015871 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015872}
15873
15874Py_UNICODE*
15875Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15876{
15877 Py_UNICODE *u = s1;
15878 while ((*u++ = *s2++));
15879 return s1;
15880}
15881
15882Py_UNICODE*
15883Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15884{
15885 Py_UNICODE *u = s1;
15886 while ((*u++ = *s2++))
15887 if (n-- == 0)
15888 break;
15889 return s1;
15890}
15891
15892Py_UNICODE*
15893Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15894{
15895 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015896 u1 += wcslen(u1);
15897 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015898 return s1;
15899}
15900
15901int
15902Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15903{
15904 while (*s1 && *s2 && *s1 == *s2)
15905 s1++, s2++;
15906 if (*s1 && *s2)
15907 return (*s1 < *s2) ? -1 : +1;
15908 if (*s1)
15909 return 1;
15910 if (*s2)
15911 return -1;
15912 return 0;
15913}
15914
15915int
15916Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15917{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015918 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015919 for (; n != 0; n--) {
15920 u1 = *s1;
15921 u2 = *s2;
15922 if (u1 != u2)
15923 return (u1 < u2) ? -1 : +1;
15924 if (u1 == '\0')
15925 return 0;
15926 s1++;
15927 s2++;
15928 }
15929 return 0;
15930}
15931
15932Py_UNICODE*
15933Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15934{
15935 const Py_UNICODE *p;
15936 for (p = s; *p; p++)
15937 if (*p == c)
15938 return (Py_UNICODE*)p;
15939 return NULL;
15940}
15941
15942Py_UNICODE*
15943Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15944{
15945 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015946 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015947 while (p != s) {
15948 p--;
15949 if (*p == c)
15950 return (Py_UNICODE*)p;
15951 }
15952 return NULL;
15953}
Victor Stinner331ea922010-08-10 16:37:20 +000015954
Victor Stinner71133ff2010-09-01 23:43:53 +000015955Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015956PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015957{
Victor Stinner577db2c2011-10-11 22:12:48 +020015958 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015959 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015961 if (!PyUnicode_Check(unicode)) {
15962 PyErr_BadArgument();
15963 return NULL;
15964 }
Inada Naoki610a60c2020-06-18 17:30:53 +090015965_Py_COMP_DIAG_PUSH
15966_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015967 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Inada Naoki610a60c2020-06-18 17:30:53 +090015968_Py_COMP_DIAG_POP
Victor Stinner577db2c2011-10-11 22:12:48 +020015969 if (u == NULL)
15970 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015971 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015972 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015973 PyErr_NoMemory();
15974 return NULL;
15975 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015976 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015977 size *= sizeof(Py_UNICODE);
15978 copy = PyMem_Malloc(size);
15979 if (copy == NULL) {
15980 PyErr_NoMemory();
15981 return NULL;
15982 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015983 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015984 return copy;
15985}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015986
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015987
Victor Stinner709d23d2019-05-02 14:56:30 -040015988static int
15989encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015990{
Victor Stinner709d23d2019-05-02 14:56:30 -040015991 int res;
15992 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15993 if (res == -2) {
15994 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15995 return -1;
15996 }
15997 if (res < 0) {
15998 PyErr_NoMemory();
15999 return -1;
16000 }
16001 return 0;
16002}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016003
Victor Stinner709d23d2019-05-02 14:56:30 -040016004
16005static int
16006config_get_codec_name(wchar_t **config_encoding)
16007{
16008 char *encoding;
16009 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16010 return -1;
16011 }
16012
16013 PyObject *name_obj = NULL;
16014 PyObject *codec = _PyCodec_Lookup(encoding);
16015 PyMem_RawFree(encoding);
16016
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016017 if (!codec)
16018 goto error;
16019
16020 name_obj = PyObject_GetAttrString(codec, "name");
16021 Py_CLEAR(codec);
16022 if (!name_obj) {
16023 goto error;
16024 }
16025
Victor Stinner709d23d2019-05-02 14:56:30 -040016026 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16027 Py_DECREF(name_obj);
16028 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016029 goto error;
16030 }
16031
Victor Stinner709d23d2019-05-02 14:56:30 -040016032 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16033 if (raw_wname == NULL) {
16034 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016035 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016036 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016037 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016038
16039 PyMem_RawFree(*config_encoding);
16040 *config_encoding = raw_wname;
16041
16042 PyMem_Free(wname);
16043 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016044
16045error:
16046 Py_XDECREF(codec);
16047 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016048 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016049}
16050
16051
Victor Stinner331a6a52019-05-27 16:39:22 +020016052static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016053init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016054{
Victor Stinner709d23d2019-05-02 14:56:30 -040016055 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016056 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016057 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016058 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016059 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016060 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016061 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016062}
16063
16064
Victor Stinner709d23d2019-05-02 14:56:30 -040016065static int
16066init_fs_codec(PyInterpreterState *interp)
16067{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016068 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016069
16070 _Py_error_handler error_handler;
16071 error_handler = get_error_handler_wide(config->filesystem_errors);
16072 if (error_handler == _Py_ERROR_UNKNOWN) {
16073 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16074 return -1;
16075 }
16076
16077 char *encoding, *errors;
16078 if (encode_wstr_utf8(config->filesystem_encoding,
16079 &encoding,
16080 "filesystem_encoding") < 0) {
16081 return -1;
16082 }
16083
16084 if (encode_wstr_utf8(config->filesystem_errors,
16085 &errors,
16086 "filesystem_errors") < 0) {
16087 PyMem_RawFree(encoding);
16088 return -1;
16089 }
16090
Victor Stinner3d17c042020-05-14 01:48:38 +020016091 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16092 PyMem_RawFree(fs_codec->encoding);
16093 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016094 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016095 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16096 PyMem_RawFree(fs_codec->errors);
16097 fs_codec->errors = errors;
16098 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016099
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016100#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016101 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016102#endif
16103
Victor Stinner709d23d2019-05-02 14:56:30 -040016104 /* At this point, PyUnicode_EncodeFSDefault() and
16105 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16106 the C implementation of the filesystem encoding. */
16107
16108 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16109 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016110 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16111 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016112 PyErr_NoMemory();
16113 return -1;
16114 }
16115 return 0;
16116}
16117
16118
Victor Stinner331a6a52019-05-27 16:39:22 +020016119static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016120init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016121{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016122 PyInterpreterState *interp = tstate->interp;
16123
Victor Stinner709d23d2019-05-02 14:56:30 -040016124 /* Update the filesystem encoding to the normalized Python codec name.
16125 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16126 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016127 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016128 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016129 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016130 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016131 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016132 }
16133
Victor Stinner709d23d2019-05-02 14:56:30 -040016134 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016135 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016136 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016137 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016138}
16139
16140
Victor Stinner331a6a52019-05-27 16:39:22 +020016141PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016142_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016143{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016144 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016145 if (_PyStatus_EXCEPTION(status)) {
16146 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016147 }
16148
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016149 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016150}
16151
16152
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016153static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016154_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016155{
Victor Stinner3d17c042020-05-14 01:48:38 +020016156 PyMem_RawFree(fs_codec->encoding);
16157 fs_codec->encoding = NULL;
16158 fs_codec->utf8 = 0;
16159 PyMem_RawFree(fs_codec->errors);
16160 fs_codec->errors = NULL;
16161 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016162}
16163
16164
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated (the configuration is left
   unchanged in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both strings first so the configuration is only modified
       when nothing can fail anymore */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16190
16191
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016192void
Victor Stinner3d483342019-11-22 12:27:50 +010016193_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016194{
Victor Stinner3d483342019-11-22 12:27:50 +010016195 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016196#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016197 /* Insure++ is a memory analysis tool that aids in discovering
16198 * memory leaks and other memory problems. On Python exit, the
16199 * interned string dictionaries are flagged as being in use at exit
16200 * (which it is). Under normal circumstances, this is fine because
16201 * the memory will be automatically reclaimed by the system. Under
16202 * memory debugging, it's a huge source of useless noise, so we
16203 * trade off slower shutdown for less distraction in the memory
16204 * reports. -baw
16205 */
16206 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016207#endif /* __INSURE__ */
16208
Victor Stinner3d483342019-11-22 12:27:50 +010016209 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016210
Victor Stinner607b1022020-05-05 18:50:30 +020016211#ifdef LATIN1_SINGLETONS
Victor Stinner3d483342019-11-22 12:27:50 +010016212 for (Py_ssize_t i = 0; i < 256; i++) {
16213 Py_CLEAR(unicode_latin1[i]);
16214 }
Victor Stinner607b1022020-05-05 18:50:30 +020016215#endif
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016216 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016217 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016218
Victor Stinner3d17c042020-05-14 01:48:38 +020016219 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016220}
16221
16222
Georg Brandl66c221e2010-10-14 07:04:07 +000016223/* A _string module, to export formatter_parser and formatter_field_name_split
16224 to the string.Formatter class implemented in Python. */
16225
16226static PyMethodDef _string_methods[] = {
16227 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16228 METH_O, PyDoc_STR("split the argument as a field name")},
16229 {"formatter_parser", (PyCFunction) formatter_parser,
16230 METH_O, PyDoc_STR("parse the argument as a format string")},
16231 {NULL, NULL}
16232};
16233
16234static struct PyModuleDef _string_module = {
16235 PyModuleDef_HEAD_INIT,
16236 "_string",
16237 PyDoc_STR("string helper module"),
16238 0,
16239 _string_methods,
16240 NULL,
16241 NULL,
16242 NULL,
16243 NULL
16244};
16245
16246PyMODINIT_FUNC
16247PyInit__string(void)
16248{
16249 return PyModule_Create(&_string_module);
16250}
16251
16252
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016253#ifdef __cplusplus
16254}
16255#endif